Python MangaCrawlerItem Examples, manga_crawler.items.MangaCrawlerItem Python Examples

Example #1

0

Show file

File: hvtt.py Project: thosuperman/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem from a manga detail page.

        Collects the name, source URL, cover image, the highest chapter
        number and the (chapter name, chapter link) pairs taken from the
        anchors of the "table table-hover" chapter table.

        Raises ValueError (from ``max``) when no number is found in any
        chapter link text.

        @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
        @returns items 1
        @scrapes name source total_chap chapters
        """
        def digits(text):
            # Every run of digits in one chapter link text.
            return re.findall(r'\d+', text)

        # All chapter data hangs off the same anchors of the chapter table.
        link_xpath = '//*[@class="table table-hover"]/tbody//tr//td//a'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath('name', '//h3[@class="__name"]/text()', MapCompose(str.strip))
        loader.add_value('source', response.url)
        loader.add_xpath('image_src', '//*[@class="__image"]/img/@src')

        # total_chap is the largest number appearing in the link texts.
        chapter_numbers = loader.get_xpath(link_xpath + '//text()', MapCompose(digits))
        loader.add_value('total_chap', max(int(number) for number in chapter_numbers))

        names = loader.get_xpath(link_xpath + '//text()')
        links = loader.get_xpath(link_xpath + '/@href')
        loader.add_value('chapters', zip(names, links))

        return loader.load_item()

Example #2

0

Show file

    def parse_item(self, response):
        """Build a MangaCrawlerItem from a page with a ``list-chapters`` block.

        The name is the part of the page title before the first " | ".
        ``total_chap`` is the largest number found in the chapter link
        texts; ``max`` raises ValueError when there are none.
        """
        def title_head(text):
            # Keep only what precedes the first " | " separator.
            return text.split(' | ')[0]

        def digits(text):
            return re.findall(r'\d+', text)

        anchor_xpath = '//*[@id="list-chapters"]/p/span/a'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath('name', '//title/text()',
                         MapCompose(title_head, str.strip))
        loader.add_value('source', response.url)
        loader.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
        # Strip each text node, join them with newlines, then strip the result.
        loader.add_xpath('description', '//*[@class="content"]//text()',
                         MapCompose(str.strip), Join('\n'),
                         MapCompose(str.strip))

        chapter_numbers = loader.get_xpath(anchor_xpath + '/text()',
                                           MapCompose(digits))
        loader.add_value('total_chap', max(map(int, chapter_numbers)))

        names = loader.get_xpath(anchor_xpath + '/text()')
        links = loader.get_xpath(anchor_xpath + '/@href')
        loader.add_value('chapters', zip(names, links))

        return loader.load_item()

Example #3

0

Show file

File: mangaseeonline.py Project: tudoanh/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source 'mangaseeonline'.

        ``full`` is True only when one of the PublishStatus text nodes is
        exactly 'Complete (Publish)'. ``total_chap`` is the first number
        found in the chapter list span texts; indexing raises IndexError
        when no number is found.
        """
        def digits(text):
            return re.findall(r'\d+', text)

        chapter_xpath = '//*[@class="list chapter-list"]/a'
        title_xpath = chapter_xpath + '/span/text()'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath('name', '//h1[@class="SeriesName"]/text()')
        loader.add_value('source', response.url)
        loader.add_xpath('image_src', '//meta[@property="og:image"]/@content')
        loader.add_xpath('description', '//*[@class="description"]/text()',
                         Join('\n'))

        status_nodes = loader.get_xpath('//*[@class="PublishStatus"]/text()')
        loader.add_value('full', 'Complete (Publish)' in status_nodes)

        chapter_numbers = loader.get_xpath(title_xpath, MapCompose(digits))
        loader.add_value('total_chap', chapter_numbers[0])

        links = loader.get_xpath(chapter_xpath + '/@href',
                                 MapCompose(make_full_url))
        names = loader.get_xpath(title_xpath)
        loader.add_value('chapters', zip(names, links))
        loader.add_value('web_source', 'mangaseeonline')

        return loader.load_item()

Example #4

0

Show file

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source "nettruyen".

        The ASCII ``name`` is derived from the scraped ``unicode_name`` with
        ``unidecode``. When one of the status text nodes is exactly
        "Hoàn thành" the item is flagged ``full`` and ``total_chap`` is the
        first number found in the chapter link texts (IndexError if there is
        none). Otherwise ``total_chap`` comes from the " Chapter N" or
        " Chap N" marker in the page title, and stays unset when no marker
        is present.

        The item is loaded exactly once and reported through ``logging`` at
        DEBUG level instead of being printed to stdout.

        @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1
        @scrapes name source image_src total_chap description chapters web_source full
        """
        import logging

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src",
                        '//*[@class="col-xs-4 col-image"]/img/@src')
        manga.add_xpath("description",
                        '//*[@class="detail-content"]/p//text()', Join("\n"))
        # Chapter anchors, skipping the "row heading" list entry.
        chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Hoàn thành" in manga.get_xpath(
                '//*[@class="status row"]/p[2]/text()'):
            manga.add_value("full", True)
            # First number in the chapter link texts, in document order.
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    chapter_xpath + "/text()",
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(int),
                )[0],
            )
        else:
            manga.add_value("full", False)
            # Isolate the chapter marker in the title first, then its digits.
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    "//title/text()",
                    MapCompose(
                        lambda x: re.findall(r" Chapter \d+| Chap \d+", x)),
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(float),
                    MapCompose(int),
                    TakeFirst(),
                ),
            )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "nettruyen")

        # Load once: the original printed one load_item() result and then
        # returned a second one.
        item = manga.load_item()
        logging.getLogger(__name__).debug("Scraped manga item: %r", item)

        return item

Example #5

0

Show file

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source "blogtruyen".

        The name is the part of the page title before the first " | ".
        ``total_chap`` is the largest number found in the chapter link texts
        (``max`` raises ValueError when there are none). Chapters whose link
        contains "mediafire" are left out.

        Each chapter name is paired with its link before the mediafire
        entries are dropped. Filtering only the links, as was done before,
        shifted every later name onto the wrong link.
        """
        anchor_xpath = '//*[@id="list-chapters"]/p/span/a'

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath(
            "name", "//title/text()", MapCompose(lambda x: x.split(" | ")[0], str.strip)
        )
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//*[@class="thumbnail"]/img/@src')
        # Strip each text node, join them with newlines, then strip the result.
        manga.add_xpath(
            "description",
            '//*[@class="content"]//text()',
            MapCompose(str.strip),
            Join("\n"),
            MapCompose(str.strip),
        )
        manga.add_value(
            "total_chap",
            max(
                int(i)
                for i in manga.get_xpath(
                    anchor_xpath + "/text()",
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                )
            ),
        )

        # NOTE(review): assumes mc yields exactly one value per href, so names
        # and links stay index-aligned - confirm against mc's definition.
        chapter_source = manga.get_xpath(anchor_xpath + "/@href", MapCompose(mc))
        chapter_name = manga.get_xpath(anchor_xpath + "/text()")
        # Kept lazy, like the zip() it replaces, so the loader handles an
        # empty chapter list the same way as before.
        chapters = (
            (name, source)
            for name, source in zip(chapter_name, chapter_source)
            if "mediafire" not in source
        )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "blogtruyen")

        # Exact match against one of the description text nodes.
        manga.add_value(
            "full",
            "Đã hoàn thành" in manga.get_xpath('//*[@class="description"]//text()'),
        )

        return manga.load_item()

Example #6

0

Show file

File: doctruyen3q.py Project: KindleManga/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source "doctruyen3q".

        Returns None without building an item when the category text
        (whitespace removed, transliterated with ``unidecode``, lower-cased)
        contains any of the filtered keywords. Otherwise returns the loaded
        item. ``total_chap`` is the integer part of the first number in the
        first chapter link text, and stays unset when no number is found.

        The item is loaded exactly once and reported through ``logging`` at
        DEBUG level instead of being printed to stdout.

        @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77
        @scrapes name source image_src total_chap description chapters web_source full unicode_name
        """
        import logging

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        category = manga.get_xpath("//*[@class='category row']/p[2]//text()")
        categories = re.sub(r'\s+', '', "".join(category))
        # Normalise once instead of once per keyword.
        normalized = unidecode(categories).lower()
        blocked = ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']
        if any(keyword in normalized for keyword in blocked):
            return None

        manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()')
        manga.add_value("name", unidecode(
            manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath(
            "image_src", '//*[@class="image-comic"]/@src')
        manga.add_xpath(
            "description", '//*[@class="detail-summary"]/text()'
        )
        chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        # Exact match against one of the status text nodes.
        manga.add_value(
            "full",
            "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'),
        )

        # The pattern also accepts decimals such as "12.5"; float -> int
        # truncates them to the whole chapter number.
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                MapCompose(float),
                MapCompose(int),
                TakeFirst(),
            ),
        )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "doctruyen3q")

        # Load once: the original printed one load_item() result and then
        # returned a second one.
        item = manga.load_item()
        logging.getLogger(__name__).debug("Scraped manga item: %r", item)

        return item

Example #7

0

Show file

File: vlogtruyen.py Project: KindleManga/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source "vlogtruyen".

        The ASCII ``name`` is derived from the scraped ``unicode_name`` with
        ``unidecode``. ``full`` is True only when one of the manga-status
        text nodes is exactly "Đã hoàn thành". ``total_chap`` is the first
        number (kept as text, decimals allowed) in the first chapter title.

        @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
        @scrapes name unicode_name source image_src total_chap description chapters web_source full
        """
        def decimal_numbers(text):
            return re.findall(r"(\d+(?:\.\d+)?)", text)

        chapter_list = '//*[@class="ul-list-chaper-detail-commic"]'
        chapter_xpath = chapter_list + '/li/a'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath("unicode_name",
                         '//h1[@class="title-commic-detail"]/text()')
        unicode_name = loader.get_output_value("unicode_name")[0]
        loader.add_value("name", unidecode(unicode_name))
        loader.add_value("source", response.url)
        loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        loader.add_xpath("description",
                         '//*[@class="desc-commic-detail"]/text()', Join("\n"))

        links = loader.get_xpath(chapter_xpath + "/@href")
        names = loader.get_xpath(chapter_xpath + "/h3/text()")

        status_nodes = loader.get_xpath('//*[@class="manga-status"]/p/text()')
        loader.add_value("full", "Đã hoàn thành" in status_nodes)

        latest_number = loader.get_xpath(
            chapter_list + '/li[1]/a/h3/text()',
            MapCompose(decimal_numbers),
            TakeFirst(),
        )
        loader.add_value("total_chap", latest_number)
        loader.add_value("chapters", zip(names, links))
        loader.add_value("web_source", "vlogtruyen")

        return loader.load_item()

Example #8

0

Show file

    def parse_item(self, response):
        """Build a MangaCrawlerItem from a manga detail page.

        Collects the name, source URL, cover image, description, the highest
        chapter number and the (chapter name, chapter link) pairs taken from
        the anchors of the "table table-hover" chapter table.

        Raises ValueError (from ``max``) when no number is found in any
        chapter link text.

        @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
        @returns items 1
        @scrapes name source total_chap chapters description
        """
        def digits(text):
            # Every run of digits in one chapter link text.
            return re.findall(r"\d+", text)

        # All chapter data hangs off the same anchors of the chapter table.
        link_xpath = '//*[@class="table table-hover"]/tbody//tr//td//a'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath("name", '//h3[@class="__name"]/text()', MapCompose(str.strip))
        loader.add_value("source", response.url)
        loader.add_xpath("image_src", '//*[@class="__image"]/img/@src')
        loader.add_xpath(
            "description", '//*[@class="__description"]//p/text()', Join("\n")
        )

        # total_chap is the largest number appearing in the link texts.
        chapter_numbers = loader.get_xpath(link_xpath + "//text()", MapCompose(digits))
        loader.add_value("total_chap", max(map(int, chapter_numbers)))

        names = loader.get_xpath(link_xpath + "//text()")
        links = loader.get_xpath(link_xpath + "/@href")
        loader.add_value("chapters", zip(names, links))

        return loader.load_item()

Example #9

0

Show file

File: blogtruyen.py Project: tudoanh/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source 'blogtruyen'.

        The name is the part of the page title before the first ' | '.
        ``total_chap`` is the largest number found in the chapter link texts
        (``max`` raises ValueError when there are none). Chapters whose link
        contains 'mediafire' are left out.

        Each chapter name is paired with its link before the mediafire
        entries are dropped. Filtering only the links, as was done before,
        shifted every later name onto the wrong link.
        """
        anchor_xpath = '//*[@id="list-chapters"]/p/span/a'

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath('name', '//title/text()',
                        MapCompose(lambda x: x.split(' | ')[0], str.strip))
        manga.add_value('source', response.url)
        manga.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
        # Strip each text node, join them with newlines, then strip the result.
        manga.add_xpath('description', '//*[@class="content"]//text()',
                        MapCompose(str.strip), Join('\n'),
                        MapCompose(str.strip))
        manga.add_value(
            'total_chap',
            max(
                int(i) for i in manga.get_xpath(
                    anchor_xpath + '/text()',
                    MapCompose(lambda x: re.findall(r'\d+', x)))
            ))

        # NOTE(review): assumes mc yields exactly one value per href, so names
        # and links stay index-aligned - confirm against mc's definition.
        chapter_source = manga.get_xpath(anchor_xpath + '/@href',
                                         MapCompose(mc))
        chapter_name = manga.get_xpath(anchor_xpath + '/text()')
        # Kept lazy, like the zip() it replaces, so the loader handles an
        # empty chapter list the same way as before.
        chapters = (
            (name, source)
            for name, source in zip(chapter_name, chapter_source)
            if 'mediafire' not in source
        )

        manga.add_value('chapters', chapters)
        manga.add_value('web_source', 'blogtruyen')

        # Exact match against one of the description text nodes.
        manga.add_value(
            'full',
            'Đã hoàn thành' in manga.get_xpath(
                '//*[@class="description"]//text()'))

        return manga.load_item()

Example #10

0

Show file

File: mangaseeonline.py Project: KindleManga/Kindlemanga

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source "mangaseeonline".

        Page fields come from XPath queries on the response. Chapters and
        ``total_chap`` come from the RSS feed linked on the page, which is
        fetched with ``feedparser``; ``total_chap`` is the first number in
        the title of the feed's first entry. A missing RSS link, an empty
        feed or a title without digits raises IndexError.

        @url https://mangasee123.com/manga/Kingdom
        @scrapes name source image_src total_chap description chapters web_source full
        """
        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath(
            "unicode_name",
            "//div[@class='container MainContainer']//li[1]/h1/text()")
        unicode_name = loader.get_output_value("unicode_name")[0]
        loader.add_value("name", unidecode(unicode_name))
        loader.add_value("source", response.url)
        loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        loader.add_xpath("description", "//div[@class='top-5 Content']/text()",
                         Join("\n"))

        status_nodes = loader.get_xpath('//*[@class="PublishStatus"]/text()')
        loader.add_value("full", "Complete (Publish)" in status_nodes)

        # The href of the "RSS Feed" link is appended to BASE_URL.
        rss_links = loader.get_xpath("//a[normalize-space()='RSS Feed']/@href")
        feed = feedparser.parse(BASE_URL + rss_links[0], agent="Mozilla/5.0")
        entries = feed['entries']

        first_title = entries[0]['title']
        loader.add_value("total_chap", re.findall(r"\d+", first_title)[0])

        loader.add_value(
            "chapters",
            [(entry['title'], entry['link']) for entry in entries],
        )
        loader.add_value("web_source", "mangaseeonline")

        return loader.load_item()

Example #11

0

Show file

    def parse_item(self, response):
        """Build a MangaCrawlerItem tagged with web_source 'nettruyen'.

        When one of the status text nodes is exactly 'Hoàn thành' the item
        is flagged ``full`` and ``total_chap`` is the first number found in
        the chapter link texts (IndexError if there is none). Otherwise
        ``total_chap`` comes from the ' Chapter N' or ' Chap N' marker in
        the page title.
        """
        def digits(text):
            return re.findall(r'\d+', text)

        def chapter_marker(text):
            return re.findall(r' Chapter \d+| Chap \d+', text)

        # Chapter anchors, skipping the "row heading" list entry.
        chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'

        loader = ItemLoader(item=MangaCrawlerItem(), response=response)
        loader.add_xpath('name', '//h1[@class="title-detail"]/text()')
        loader.add_value('source', response.url)
        loader.add_xpath('image_src',
                         '//*[@class="col-xs-4 col-image"]/img/@src')
        loader.add_xpath('description',
                         '//*[@class="detail-content"]/p//text()', Join('\n'))

        links = loader.get_xpath(chapter_xpath + '/@href')
        names = loader.get_xpath(chapter_xpath + '/text()')

        status_nodes = loader.get_xpath('//*[@class="status row"]/p[2]/text()')
        finished = 'Hoàn thành' in status_nodes
        loader.add_value('full', finished)

        if finished:
            total_chap = loader.get_xpath(
                chapter_xpath + '/text()',
                MapCompose(digits),
                MapCompose(int),
            )[0]
        else:
            total_chap = loader.get_xpath(
                '//title/text()',
                MapCompose(chapter_marker),
                MapCompose(digits),
                MapCompose(int),
                TakeFirst(),
            )
        loader.add_value('total_chap', total_chap)

        loader.add_value('chapters', zip(names, links))
        loader.add_value('web_source', 'nettruyen')

        return loader.load_item()