Example #1
    def _parse_issues(self, response):
        """Generate the list of issues for a manga

        @url https://mangadex.org/manga/39/one-piece
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """
        manga = response.meta['manga']
        chapters = response.meta['chapters']

        xp = '//tr[contains(@id,"chapter_")]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue()
            # Name
            xp = './/a/text()'
            issue['name'] = line.xpath(xp).extract_first()
            # Number
            xp = './/a/@data-chapter-num'
            issue['number'] = line.xpath(xp).extract()
            # Order
            issue['order'] = chapters - len(manga['issues'])
            # Language
            xp = './/img/@title'
            issue['language'] = line.xpath(xp).extract()
            # Release
            xp = './/time/@datetime'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)

        # Next page
        xp = '//ul[@class="pagination"]/li[@class="active"]' \
            '/following-sibling::li[@class="paging"]/a/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            meta = {
                'manga': manga,
                'chapters': chapters,
            }
            return response.follow(next_url, self._parse_issues, meta=meta)
        else:
            return manga
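
The @url, @returns and @scrapes lines in the docstring are Scrapy spider contracts: @url names a sample page to fetch, @returns puts bounds on how many items or requests the callback may produce, and @scrapes lists the item fields that must be populated. They can be exercised with Scrapy's built-in contract runner, scrapy check <spider-name>, where <spider-name> is the spider's name attribute. Note also how the partially built manga is carried across paginated requests through the request meta dict and picked up again from response.meta on the next page.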
Example #2
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://kissmanga.com/Manga/Naruto
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::a/text()'
        manga['alt_name'] = response.xpath(xp % 'Other name:').extract()
        # Author
        manga['author'] = response.xpath(xp % 'Author:').extract()
        # Artist
        manga['artist'] = manga['author']
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Genres
        manga['genres'] = response.xpath(xp % 'Genres:').extract()
        # Status
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::text()[1]'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Rank
        manga['rank'] = response.xpath(xp % 'Views:').re(r'(\d+).')
        # Rank order
        manga['rank_order'] = 'DESC'
        # Description
        xp = '//p[span[@class="info" and contains(text(), "%s")]]'\
             '/following-sibling::p[1]/text()'
        manga['description'] = response.xpath(xp % 'Summary:').extract()
        # Cover image
        xp = '//div[@id="rightside"]//img/@src'
        url = response.xpath(xp).extract_first()
        manga['image_urls'] = [response.urljoin(url)]

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@class="listing"]/tr[td]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = './/a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            # Some examples that this regex needs to address
            #   1/11 Vol.003 Ch.009.006: Omake 004-koma
            #   21st Century Boys 014
            #   Mob Psycho 100 Ch.099.001: Mob
            #   Mob Psycho 100 Ch.098.002
            #   Fantastic World Vol.001 Ch.002
            #   Black Clover 118 - Mage X
            #   Black Clover 099: Family
            xp = './/a/text()'
            number = line.xpath(xp).re(
                r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)(\d[.\d]+)'
                r'|(\d[.\d]+)[ :-]+'
                r'|(\d[.\d]+)$')
            issue['number'] = number
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './td[2]/text()'
            issue['release'] = line.xpath(xp).re(r'\d{1,2}/\d{1,2}/\d{4}')
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga
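
All of the callbacks populate Manga and Issue objects whose definitions are not part of these snippets. A minimal sketch of what they might look like, assuming plain Scrapy items with one Field per key used above (the real project may declare extra fields or input/output processors):

import scrapy


class Issue(scrapy.Item):
    # Fields filled in by the parsers above
    name = scrapy.Field()
    number = scrapy.Field()
    order = scrapy.Field()
    language = scrapy.Field()
    release = scrapy.Field()
    url = scrapy.Field()


class Manga(scrapy.Item):
    # Fields filled in by the parsers above
    url = scrapy.Field()
    name = scrapy.Field()
    alt_name = scrapy.Field()
    author = scrapy.Field()
    artist = scrapy.Field()
    reading_direction = scrapy.Field()
    status = scrapy.Field()
    genres = scrapy.Field()
    rank = scrapy.Field()
    rank_order = scrapy.Field()
    description = scrapy.Field()
    # image_urls/images follow the convention of Scrapy's standard
    # ImagesPipeline, which the examples appear to rely on
    image_urls = scrapy.Field()
    images = scrapy.Field()
    issues = scrapy.Field()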
Example #3
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url http://bato.to
        @returns items 1 100
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = './/tr[contains(@class, "row")]'
        last_row, manga = None, None
        for update in response.xpath(xp):
            row = update.xpath('@class').extract()[0].split()[0]
            if row != last_row:
                if manga:
                    yield manga
                manga = Manga(issues=[])
                # Name
                xp = 'td[2]/a[2]/text()'
                manga['name'] = update.xpath(xp).extract()
                # URL
                xp = 'td[2]/a[2]/@href'
                manga['url'] = update.xpath(xp).extract()
            else:
                issue = Issue()
                # Name
                xp = './/td/a[img/@style="vertical-align:middle;"]/text()'
                issue['name'] = update.xpath(xp).extract()
                # Number
                issue['number'] = update.xpath(xp).re(
                    r'Ch.(?:Story )?([.\d]+)')
                # Order
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Language
                xp = './/td/div/@title'
                issue['language'] = update.xpath(xp).extract()
                # Release
                xp = './/td[last()]/text()'
                issue['release'] = update.xpath(xp).extract()
                # URL
                xp = './/td/a[img/@style="vertical-align:middle;"]/@href'
                url = update.xpath(xp).extract()[0]
                issue['url'] = response.urljoin(url)

                # Check if this is a new update
                update_date = convert_to_date(issue['release'][0].strip())
                if update_date < until:
                    return

                manga['issues'].append(issue)
            last_row = row

        # Return the last manga
        if manga:
            yield manga

        # Next page
        xp = '//a[@title="Older Releases"]/@href'
        next_url = response.xpath(xp).extract()
        if next_url:
            next_url = response.urljoin(next_url[0])
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
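
The parse_latest callbacks compare each scraped release string with the until date through a convert_to_date helper that is not shown in these examples. A hypothetical sketch of such a helper, assuming the sites emit either relative labels or a few simple absolute formats (the real implementation is site-specific):

from datetime import date, datetime, timedelta


def convert_to_date(text):
    # Hypothetical helper: turn a scraped release string into a date
    text = text.strip().lower()
    if text in ('today', 'just now', 'now'):
        return date.today()
    if text == 'yesterday':
        return date.today() - timedelta(days=1)
    # Try a few absolute formats commonly seen on these sites
    for fmt in ('%m/%d/%Y', '%d/%m/%Y', '%b %d, %Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            pass
    # Fall back to today so the comparison against `until` still works
    return date.today()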
Example #4
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://www.mangahere.cc/manga/angel_densetsu/
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # MangaHere returns 200 for 404 pages
        xp = '//div[@class="error_404"]'
        if response.xpath(xp).extract():
            return

        # Check if manga is licensed
        xp = '//div[@class="detail_list"]/div[@class="mt10 color_ff00 mb10"]'
        if response.xpath(xp).extract():
            return

        # URL
        manga['url'] = response.url
        # Name
        xp = '//meta[@property="og:title"]/@content'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//li[label[contains(text(),"%s")]]/text()'
        manga['alt_name'] = response.xpath(
            xp % 'Alternative Name:').re(r'([^;]+)')
        # Author
        xp = '//li[label[contains(text(),"%s")]]/a/text()'
        manga['author'] = response.xpath(xp % 'Author(s):').extract()
        # Artist
        manga['artist'] = response.xpath(xp % 'Artist(s):').extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = '//li[label[contains(text(),"%s")]]/text()'
        manga['status'] = response.xpath(xp % 'Status:').extract_first()
        # Genres
        manga['genres'] = response.xpath(xp % 'Genre(s):').re(r'([^,]+)')
        # Rank
        manga['rank'] = response.xpath(xp % 'Rank:').extract()
        # Rank order
        manga['rank_order'] = 'ASC'
        # Description
        xp = '//li[label[contains(text(),"%s")]]/p[@id="show"]/text()'
        manga['description'] = response.xpath(xp % 'Summary:').extract()
        # Cover image
        xp = '//img[@class="img"]/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//div[@class="detail_list"]/ul[not(@class)]/li'
        lines = response.xpath(xp)

        # Check if the lines are empty
        if len(lines) == 1 and 'No Manga Chapter' in lines[0].extract():
            return

        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = './/a/text()'
            name_1 = line.xpath(xp).extract()
            xp = './/span[@class="mr6"]/text()'
            name_2 = line.xpath(xp).extract()
            xp = './/span[@class="left"]/text()'
            name_3 = line.xpath(xp).extract()
            issue['name'] = name_1 + name_2 + name_3
            # Number
            xp = './/a/text()'
            issue['number'] = line.xpath(xp).re(
                r'([.\d]+)\s*$')
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './/span[@class="right"]/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga
Example #5
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url http://www.mangahere.co/latest/
        @returns items 1 100
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = '//div[@class="manga_updates"]/dl'
        for update in response.xpath(xp):
            # Check if this is a new update
            xp = './/span[@class="time"]/text()'
            update_date = update.xpath(xp).extract()
            update_date = convert_to_date(update_date[0])
            if update_date < until:
                return

            manga = Manga()
            # Name
            xp = './/a[@class="manga_info"]/text()'
            manga['name'] = update.xpath(xp).extract()
            # URL
            xp = './/a[@class="manga_info"]/@href'
            url = update.xpath(xp).extract()
            manga['url'] = response.urljoin(url[0])

            # Parse the manga issues list
            manga['issues'] = []
            xp = './dd'
            for line in update.xpath(xp):
                issue = Issue(language='EN')
                # Name
                xp = 'a/text()'
                issue['name'] = line.xpath(xp).extract()
                # Number
                xp = 'a/text()'
                issue['number'] = line.xpath(xp).re(r'([.\d]+)\s*$')
                # Order
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Release
                issue['release'] = update_date
                # URL
                xp = 'a/@href'
                url = line.xpath(xp).extract()
                issue['url'] = response.urljoin(url[0])
                manga['issues'].append(issue)
            yield manga

        # Next page
        xp = '//a[@class="next"]/@href'
        next_url = response.xpath(xp).extract()
        if next_url:
            next_url = response.urljoin(next_url[0])
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #6
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://bato.to/comic/_/comics/angel-densetsu-r460
        @returns items 1 1
        @returns request 0 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        # Batoto can have different URLs for comics (_/comics/,
        # _/sp/), so here we update the manga with the real one.
        manga['url'] = response.url
        # Name
        xp = '//h1[@class="ipsType_pagetitle"]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//td[contains(text(),"%s")]/following-sibling::td/*/text()'
        manga['alt_name'] = response.xpath(xp % 'Alt Names:').re(r'([^,;]+)')
        # Author
        manga['author'] = response.xpath(xp % 'Author:').extract()
        # Artist
        manga['artist'] = response.xpath(xp % 'Artist:').extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Genres
        xp = '//td[contains(text(),"%s")]/following-sibling::td/a/span/text()'
        manga['genres'] = response.xpath(xp % 'Genres:').extract()
        # Description
        xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
        manga['description'] = response.xpath(xp % 'Description:').extract()
        # Cover image
        xp = '//div[@class="ipsBox"]/div/div/img/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//tr[contains(@class,"chapter_row")' \
             ' and not(contains(@class,"chapter_row_expand"))]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue()
            # Name
            xp = './td[1]/a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            issue['number'] = line.xpath(xp).re(r'Ch.(?:Story )?([.\d]+)')
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Language
            xp = './td[2]/div/@title'
            issue['language'] = line.xpath(xp).extract()
            # Release
            xp = './td[5]/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './td[1]/a/@href'
            url = line.xpath(xp).extract()[0]
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        yield manga
Example #7
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://submanga.org/bloody-cross
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//a[@class="btn-link text-semibold text-mint"][2]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//span[contains(text(), "%s")]/following-sibling::text()'
        title = u'Títulos alternativos:'
        manga['alt_name'] = response.xpath(xp % title).re(r'([^,;]+)')
        # Author
        manga['author'] = response.xpath(xp % 'Autor:').extract()
        # Artist
        manga['artist'] = manga['author']
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status (Finalizado / En curso, i.e. finished / ongoing)
        xp = '//span[@class="text-2x text-thin"]/text()'
        manga['status'] = response.xpath(xp).extract()
        # Genres
        xp = '//span[contains(text(), "Generos:")]/following-sibling::a/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//p[@class="text-justify"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="img-cover-m"]/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@id="caps-list"]//tr'
        lines = response.xpath(xp)
        for line in lines:
            xp = 'td[3]/a/img/@alt'
            langs = line.xpath(xp).extract()
            for lang in langs:
                issue = Issue(language=lang)
                # Name
                xp = 'td[2]/a/text()'
                issue['name'] = line.xpath(xp).extract()
                # Number
                xp = 'td[1]/a/text()'
                issue['number'] = line.xpath(xp).extract_first()
                # Order
                issue['order'] = int(issue['number'])
                # Release
                xp = 'td[4]/a/span/text()'
                issue['release'] = line.xpath(xp).extract()
                # URL
                xp = 'td[1]/a/@href'
                url = line.xpath(xp).extract_first()
                issue['url'] = '%s/%s' % (url, lang)
                manga['issues'].append(issue)
        return manga
Example #8
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://unionmangas.net/manga/bleach
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//div[@class="col-md-12"]/h2/text()'
        manga['name'] = response.xpath(xp).extract_first()
        # Alternate name
        manga['alt_name'] = manga['name']
        # Author
        xp = '//label[contains(text(), "%s")]/following-sibling::text()'
        manga['author'] = response.xpath(xp % 'Autor:').extract()
        # Artist
        manga['artist'] = response.xpath(xp % 'Artista:').extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status (Ativo / Completo, i.e. active / complete)
        xp = '//label[contains(text(), "Status:")]' \
             '/following-sibling::span/text()'
        manga['status'] = response.xpath(xp).extract()
        # Genres
        xp = u'//label[contains(text(), "Gênero(s):")]' \
             u'/following-sibling::a/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//div[@class="panel-body"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="img-thumbnail"]/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//div[@class="col-xs-6 col-md-6"]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='PT')
            # Name
            xp = 'a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            issue['number'] = line.xpath(xp).re(r'Cap. ([.\d]+)$')
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = 'span/text()'
            issue['release'] = line.xpath(xp).re(r'\d{2}/\d{2}/\d{4}')
            # URL
            xp = 'a/@href'
            issue['url'] = line.xpath(xp).extract()
            manga['issues'].append(issue)
        return manga
Example #9
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://mangaseeonline.us/manga/Shingeki-No-Kyojin
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//h1/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//div/b[contains(text(),"%s")]/following-sibling::text()'
        manga['alt_name'] = response.xpath(xp %
                                           'Alternate Name(s):').re(r'([^,]+)')
        # Author
        xp = '//div/b[contains(text(),"%s")]/following-sibling::*/text()'
        manga['author'] = response.xpath(xp % 'Author(s):').extract()
        # Artist
        manga['artist'] = manga['author']
        # Reading direction
        manga['reading_direction'] = response.xpath(xp %
                                                    'Type:').extract_first()
        # Status
        manga['status'] = response.xpath(xp % 'Status:').extract_first()
        # Genres
        xp = '//div/b[contains(text(),"%s")]/following-sibling::*/text()'
        manga['genres'] = response.xpath(xp % 'Genre(s):').extract()
        # Rank order
        manga['rank_order'] = 'DESC'
        # Description
        xp = '//div[@class="description"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//div[contains(@class,"leftImage")]/img/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//a[@class="list-group-item"]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = './span/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            xp = './@chapter'
            issue['number'] = line.xpath(xp).extract()
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './time/@datetime'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)

        # Rank
        url = response.urljoin('subscribe.button.php')
        xp = '//input[@class="IndexName"]/@value'
        index_name = response.xpath(xp).extract_first()
        form_data = {'IndexName': index_name}
        meta = {'manga': manga}
        return scrapy.FormRequest(url,
                                  self._parse_subscribe,
                                  formdata=form_data,
                                  meta=meta)
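
The FormRequest above chains into a _parse_subscribe callback that is not included in these examples. A hypothetical sketch of it, assuming the subscribe.button.php endpoint answers with a small snippet from which the subscriber count can be extracted and used as the rank:

    def _parse_subscribe(self, response):
        # Hypothetical callback: recover the manga carried in the meta
        # dict and complete the missing rank field before returning it
        manga = response.meta['manga']
        # Assumed markup: take the first run of digits in the reply as
        # the subscriber count
        manga['rank'] = response.selector.re_first(r'(\d[\d,]*)')
        return manga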
Example #10
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://kissmanga.com/Manga/Naruto
        @returns items 1 1
        @returns request 0 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # Name
        xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::a/text()'
        manga['alt_name'] = response.xpath(xp % 'Other name:').extract()
        # Author
        manga['author'] = response.xpath(xp % 'Author:').extract()
        # Artist
        manga['artist'] = manga['author']
        # Genres
        manga['genres'] = response.xpath(xp % 'Genres:').extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::text()[1]'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Rank
        manga['rank'] = response.xpath(xp % 'Views:').re(r'(\d+).')
        manga['rank_order'] = 'DESC'
        # Description
        xp = '//p[span[@class="info" and contains(text(), "%s")]]'\
             '/following-sibling::p[1]/text()'
        manga['description'] = response.xpath(xp % 'Summary:').extract()
        # Cover image
        xp = '//div[@id="rightside"]//img/@src'
        url = response.xpath(xp).extract()
        manga['image_urls'] = [response.urljoin(url[0])]

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@class="listing"]/tr[td]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = './/a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            xp = './/a/text()'
            number = line.xpath(xp).re(
                r'(?:[Vv]ol.[.\d]+)?\s*'
                r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)?(\d[.\d]+)')
            # Take the first capture when the link yields several
            # matches; otherwise keep the (possibly empty) list
            issue['number'] = number[0] if len(number) > 1 else number
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './td[2]/text()'
            issue['release'] = line.xpath(xp).re(r'\d{1,2}/\d{1,2}/\d{4}')
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract()
            issue['url'] = response.urljoin(url[0])
            manga['issues'].append(issue)
        yield manga
Example #11
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url https://www.mangareader.net/178/angel-densetsu.html
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//h2[@class="aname"]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
        _alt_name = response.xpath(xp % 'Alternate Name:')
        manga['alt_name'] = _alt_name.re(r'([^,;]+)')
        # Author
        manga['author'] = response.xpath(xp % 'Author:').extract()
        # Artist
        manga['artist'] = response.xpath(xp % 'Artist:').extract()
        # Reading direction
        rd = response.xpath(xp % 'Reading Direction:').extract_first()
        manga['reading_direction'] = ('RL' if rd == 'Right to Left' else 'LR')
        # Status
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Genres
        xp = '//span[@class="genretags"]/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//div[@id="readmangasum"]/p/text()'
        manga['description'] = '\n'.join(response.xpath(xp).extract())
        # Cover image
        xp = '//div[@id="mangaimg"]/img/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@id="listing"]/tr[td]'
        for line in response.xpath(xp):
            issue = Issue(language='EN')
            # Name
            xp = './/a/text()'
            name_1 = line.xpath(xp).extract()
            xp = './/a/following-sibling::text()'
            name_2 = line.xpath(xp).extract()
            issue['name'] = name_1 + name_2
            # Number
            xp = './/a/text()'
            issue['number'] = line.xpath(xp).re(r'([.\d]+)$')
            # Order
            issue['order'] = len(manga['issues'])
            # Release
            xp = './td[2]/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga
Example #12
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url http://www.mangareader.net/latest
        @returns items 1 30
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = '//table[@class="updates"]/tr[@class="c2"]'
        for update in response.xpath(xp):
            # Check if this is a new update
            xp = './/td[@class="c1"]/text()'
            update_date = update.xpath(xp).extract()
            update_date = convert_to_date(update_date[0])
            if update_date < until:
                return

            manga = Manga()
            # Name
            xp = './/a[@class="chapter"]/strong/text()'
            manga['name'] = update.xpath(xp).extract()
            # URL
            xp = './/a[@class="chapter"]/@href'
            url = update.xpath(xp).extract()
            manga['url'] = response.urljoin(url[0])

            # Parse the manga issues list
            manga['issues'] = []
            xp = './/a[@class="chaptersrec"]'
            for line in update.xpath(xp):
                issue = Issue(language='EN')
                # Name
                xp = 'text()'
                issue['name'] = line.xpath(xp).extract()
                # Number
                xp = 'text()'
                issue['number'] = line.xpath(xp).re(r'(\d+)$')
                # Order
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Release
                issue['release'] = update_date
                # URL
                xp = '@href'
                url = line.xpath(xp).extract()
                issue['url'] = response.urljoin(url[0])
                manga['issues'].append(issue)
            yield manga

        # Next page
        xp = '//div[@id="latest"]/div[@id="sp"]/a[contains(., ">")]/@href'
        next_url = response.xpath(xp).extract()
        if next_url:
            next_url = response.urljoin(next_url[0])
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #13
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url http://mangafox.me/manga/a_bias_girl/
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # Check if manga is licensed
        xp = '//div[@class="warning" and contains(text(),"has been licensed")]'
        if response.xpath(xp).extract():
            return

        # URL
        manga['url'] = response.url
        # Name
        xp = '//title/text()'
        manga['name'] = response.xpath(xp).re(r'(.*) - Read')
        # Alternate name
        xp = '//div[@id="title"]/h3//text()'
        manga['alt_name'] = response.xpath(xp).extract()
        # Author
        xp = '//div[@id="title"]//tr[2]/td[2]/a/text()'
        manga['author'] = response.xpath(xp).extract()
        # Artist
        xp = '//div[@id="title"]//tr[2]/td[3]/a/text()'
        manga['artist'] = response.xpath(xp).extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = './/div[@class="data"][1]/span/text()'
        manga['status'] = response.xpath(xp).re(r'\w+')
        # Genres
        xp = '//div[@id="title"]//tr[2]/td[4]/a/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//div[@id="title"]/p[@class="summary"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//div[@class="cover"]/img/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//ul[@class="chlist"]/li'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = '(.//h3|.//h4)/a/text()'
            name_1 = line.xpath(xp).extract()
            xp = '(.//h3|.//h4)/span[@class="title nowrap"]/text()'
            name_2 = line.xpath(xp).extract()
            issue['name'] = name_1 + name_2
            # Number
            xp = '(.//h3|.//h4)/a/text()'
            issue['number'] = line.xpath(xp).re(r'.*?([.\d]+)$')
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './/span[@class="date"]/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './/a[@class="tips"]/@href'
            issue['url'] = line.xpath(xp).extract()
            manga['issues'].append(issue)
        return manga
Example #14
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url http://mangafox.me/releases/
        @returns items 1 100
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = '//ul[@id="updates"]/li/div'
        for update in response.xpath(xp):
            manga = Manga()
            # Name
            xp = './/h3/a/text()'
            manga['name'] = update.xpath(xp).extract()
            # URL
            xp = './/h3/a//@href'
            manga['url'] = update.xpath(xp).extract()

            # Parse the manga issues list
            manga['issues'] = []
            xp = './/dt'
            for line in update.xpath(xp):
                # Check if this is a new update
                xp = './/em/text()'
                update_date = update.xpath(xp).extract()
                update_date = convert_to_date(update_date[0])
                if update_date < until:
                    return

                issue = Issue(language='EN')
                # Name
                xp = './/span//text()'
                issue['name'] = line.xpath(xp).extract()
                # Number
                xp = './/span/a/text()'
                issue['number'] = line.xpath(xp).re(r'(\d+)$')
                # Order
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Release
                issue['release'] = update_date
                # URL
                xp = './/span/a/@href'
                url = line.xpath(xp).extract()
                issue['url'] = urljoin(response.url, url[0])
                manga['issues'].append(issue)
            yield manga

        # Next page
        xp = '//a[span[@class="next"]]/@href'
        next_url = response.xpath(xp).extract()
        if next_url:
            next_url = urljoin(response.url, next_url[0])
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #15
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        @url https://bato.to/series/68329
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//h3[@class="item-title"]/a/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//div[@class="pb-2 alias-set hairlines-fade-bottom"]/text()'
        manga['alt_name'] = response.xpath(xp).extract_first().split('/')
        # Author
        xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
            '/following-sibling::span/*/text()'
        manga['author'] = response.xpath(xp % 'Authors:').extract_first()
        # Artist
        manga['artist'] = response.xpath(xp % 'Authors:').extract()[1:]
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
            '/following-sibling::span/text()'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Genres
        genres = response.xpath(xp % 'Genres:').extract()[-1]
        manga['genres'] = genres.split('/')
        # Rank
        rank = response.xpath(xp % 'Rank:').extract_first()
        manga['rank'] = rank.split(',')[0]
        # Rank order
        manga['rank_order'] = 'ASC'
        # Description
        xp = '//pre/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="shadow-6"]/@src'
        url = response.xpath(xp).extract_first()
        manga['image_urls'] = [response.urljoin(url)]

        # Get language from the title flag
        xp = '//div[@class="mt-4 title-set"]/span/@class'
        language = response.xpath(xp).extract_first()
        language = language.split()[-1]

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//div[@class="main"]/div'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language=language)
            # Name
            xp = './a//text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            xp = './a/b/text()'
            issue['number'] = line.xpath(xp).re(r'Ch.(\d+)')
            # Order
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './/i/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga
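
Each snippet above is a method of a spider class that is not shown. A minimal sketch of how one of the parse_collection callbacks could be wired into a complete spider; the class name and spider name are placeholders, and the start URL is simply the sample page from Example #2:

import scrapy


class ExampleMangaSpider(scrapy.Spider):
    # Placeholder spider: the real projects define their own names,
    # allowed domains and settings
    name = 'example-manga'
    start_urls = ['http://kissmanga.com/Manga/Naruto']

    def parse(self, response):
        # Delegate to a collection parser such as the one in Example #2
        return self.parse_collection(response)

    # parse_collection(), parse_latest() and the other callbacks shown
    # above would be defined here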