def parse_item(self, response):
    """Extract one manga item from a hocvientruyentranh.com manga page.

    @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
    @returns items 1
    @scrapes name source total_chap chapters
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath('name', '//h3[@class="__name"]/text()',
                     MapCompose(str.strip))
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="__image"]/img/@src')

    # The largest integer mentioned in any chapter link label is taken as
    # the total chapter count.
    numbers = loader.get_xpath(
        '//*[@class="table table-hover"]/tbody//tr//td//a//text()',
        MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', max(int(n) for n in numbers))

    hrefs = loader.get_xpath(
        '//*[@class="table table-hover"]/tbody//tr//td//a/@href')
    titles = loader.get_xpath(
        '//*[@class="table table-hover"]/tbody//tr//td//a//text()')
    loader.add_value('chapters', zip(titles, hrefs))
    return loader.load_item()
def parse_item(self, response):
    """Scrape one manga detail page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    # The page <title> is "<name> | <site>"; keep only the name part.
    loader.add_xpath('name', '//title/text()',
                     MapCompose(lambda t: t.split(' | ')[0], str.strip))
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
    loader.add_xpath('description', '//*[@class="content"]//text()',
                     MapCompose(str.strip), Join('\n'), MapCompose(str.strip))

    # Total chapters = the largest integer found in any chapter link label.
    numbers = loader.get_xpath(
        '//*[@id="list-chapters"]/p/span/a/text()',
        MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', max(int(n) for n in numbers))

    titles = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/text()')
    hrefs = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/@href')
    loader.add_value('chapters', zip(titles, hrefs))
    return loader.load_item()
def parse_item(self, response):
    """Scrape a MangaSeeOnline series page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath('name', '//h1[@class="SeriesName"]/text()')
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//meta[@property="og:image"]/@content')
    loader.add_xpath('description', '//*[@class="description"]/text()',
                     Join('\n'))

    # Series is flagged complete only when publishing has finished.
    status = loader.get_xpath('//*[@class="PublishStatus"]/text()')
    loader.add_value('full', 'Complete (Publish)' in status)

    chapter_xpath = '//*[@class="list chapter-list"]/a'
    # First listed chapter is the newest; its first number is the total.
    loader.add_value(
        'total_chap',
        loader.get_xpath(chapter_xpath + '/span/text()',
                         MapCompose(lambda t: re.findall(r'\d+', t)))[0])

    titles = loader.get_xpath(chapter_xpath + '/span/text()')
    hrefs = loader.get_xpath(chapter_xpath + '/@href',
                             MapCompose(make_full_url))
    loader.add_value('chapters', zip(titles, hrefs))
    loader.add_value('web_source', 'mangaseeonline')
    return loader.load_item()
def parse_item(self, response):
    """Scrape a nettruyen manga page (rendered through Splash).

    @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1
    @scrapes name source image_src total_chap description chapters web_source full
    """
    manga = ItemLoader(item=MangaCrawlerItem(), response=response)
    manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()')
    # ASCII-fold the Vietnamese title for the searchable "name" field.
    manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0]))
    manga.add_value("source", response.url)
    manga.add_xpath("image_src", '//*[@class="col-xs-4 col-image"]/img/@src')
    manga.add_xpath("description", '//*[@class="detail-content"]/p//text()',
                    Join("\n"))

    # Chapter rows, excluding the "row heading" header row.
    chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'
    chapter_source = manga.get_xpath(chapter_xpath + "/@href")
    chapter_name = manga.get_xpath(chapter_xpath + "/text()")
    chapters = zip(chapter_name, chapter_source)

    if "Hoàn thành" in manga.get_xpath(
            '//*[@class="status row"]/p[2]/text()'):
        # Finished series: the newest chapter link carries the final count.
        manga.add_value("full", True)
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                chapter_xpath + "/text()",
                MapCompose(lambda x: re.findall(r"\d+", x)),
                MapCompose(int),
            )[0],
        )
    else:
        # Ongoing series: fall back to the chapter number in the page
        # <title>; go through float first so decimal chapter numbers
        # (e.g. "12.5") truncate instead of raising on int().
        manga.add_value("full", False)
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                "//title/text()",
                MapCompose(
                    lambda x: re.findall(r" Chapter \d+| Chap \d+", x)),
                MapCompose(lambda x: re.findall(r"\d+", x)),
                MapCompose(float),
                MapCompose(int),
                TakeFirst(),
            ),
        )

    manga.add_value("chapters", chapters)
    manga.add_value("web_source", "nettruyen")
    # Fix: removed the leftover debug print and the resulting double
    # load_item() call — the item is now built exactly once.
    return manga.load_item()
def parse_item(self, response):
    """Scrape a blogtruyen manga page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Page <title> is "<name> | <site>"; keep only the name part.
    loader.add_xpath("name", "//title/text()",
                     MapCompose(lambda t: t.split(" | ")[0], str.strip))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//*[@class="thumbnail"]/img/@src')
    loader.add_xpath("description", '//*[@class="content"]//text()',
                     MapCompose(str.strip), Join("\n"), MapCompose(str.strip))

    # Total chapters = the largest integer in any chapter link label.
    numbers = loader.get_xpath(
        '//*[@id="list-chapters"]/p/span/a/text()',
        MapCompose(lambda t: re.findall(r"\d+", t)))
    loader.add_value("total_chap", max(int(n) for n in numbers))

    # Skip chapters hosted on mediafire; they cannot be read in-site.
    links = [
        href
        for href in loader.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/@href', MapCompose(mc))
        if "mediafire" not in href
    ]
    titles = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/text()')
    loader.add_value("chapters", zip(titles, links))
    loader.add_value("web_source", "blogtruyen")

    finished = "Đã hoàn thành" in loader.get_xpath(
        '//*[@class="description"]//text()')
    loader.add_value("full", finished)
    return loader.load_item()
def parse_item(self, response):
    """Scrape a doctruyen3q manga page, skipping adult-tagged series.

    @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77
    @scrapes name source image_src total_chap description chapters web_source full unicode_name
    """
    manga = ItemLoader(item=MangaCrawlerItem(), response=response)

    # Drop the whole item when any blocked genre tag is present
    # (whitespace stripped, accents folded, lowercased before matching).
    category = manga.get_xpath("//*[@class='category row']/p[2]//text()")
    categories = re.sub(r'\s+', '', "".join(category))
    if any(i in unidecode(categories).lower()
           for i in ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']):
        return

    manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()')
    # ASCII-fold the Vietnamese title for the searchable "name" field.
    manga.add_value("name", unidecode(
        manga.get_output_value("unicode_name")[0]))
    manga.add_value("source", response.url)
    manga.add_xpath("image_src", '//*[@class="image-comic"]/@src')
    manga.add_xpath("description", '//*[@class="detail-summary"]/text()')

    chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a'
    chapter_source = manga.get_xpath(chapter_xpath + "/@href")
    chapter_name = manga.get_xpath(chapter_xpath + "/text()")
    chapters = zip(chapter_name, chapter_source)

    if "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'):
        manga.add_value("full", True)
    else:
        manga.add_value("full", False)

    # First list entry is the newest chapter; parse its (possibly decimal)
    # number and truncate via float -> int for the chapter count.
    manga.add_value(
        "total_chap",
        manga.get_xpath(
            '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()',
            MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
            MapCompose(float),
            MapCompose(int),
            TakeFirst(),
        ),
    )
    manga.add_value("chapters", chapters)
    manga.add_value("web_source", "doctruyen3q")
    # Fix: removed the leftover debug print and the resulting double
    # load_item() call — the item is now built exactly once.
    return manga.load_item()
def parse_item(self, response):
    """Scrape a vlogtruyen manga page (rendered through Splash).

    @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
    @scrapes name unicode_name source image_src total_chap description chapters web_source full
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath("unicode_name",
                     '//h1[@class="title-commic-detail"]/text()')
    # ASCII-fold the title for the searchable "name" field.
    loader.add_value("name",
                     unidecode(loader.get_output_value("unicode_name")[0]))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
    loader.add_xpath("description", '//*[@class="desc-commic-detail"]/text()',
                     Join("\n"))

    chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a'
    hrefs = loader.get_xpath(chapter_xpath + "/@href")
    titles = loader.get_xpath(chapter_xpath + "/h3/text()")

    status = loader.get_xpath('//*[@class="manga-status"]/p/text()')
    loader.add_value("full", "Đã hoàn thành" in status)

    # First list entry is the newest chapter; its (possibly decimal)
    # number is taken as the chapter total.
    loader.add_value(
        "total_chap",
        loader.get_xpath(
            '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()',
            MapCompose(lambda t: re.findall(r"(\d+(?:\.\d+)?)", t)),
            TakeFirst(),
        ),
    )
    loader.add_value("chapters", zip(titles, hrefs))
    loader.add_value("web_source", "vlogtruyen")
    return loader.load_item()
def parse_item(self, response):
    """Extract one manga item from a hocvientruyentranh.com manga page.

    @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
    @returns items 1
    @scrapes name source total_chap chapters description
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath("name", '//h3[@class="__name"]/text()',
                     MapCompose(str.strip))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//*[@class="__image"]/img/@src')
    loader.add_xpath("description", '//*[@class="__description"]//p/text()',
                     Join("\n"))

    row_link = '//*[@class="table table-hover"]/tbody//tr//td//a'
    # The largest integer mentioned in any chapter link label is taken
    # as the total chapter count.
    numbers = loader.get_xpath(row_link + '//text()',
                               MapCompose(lambda t: re.findall(r"\d+", t)))
    loader.add_value("total_chap", max(int(n) for n in numbers))

    hrefs = loader.get_xpath(row_link + '/@href')
    titles = loader.get_xpath(row_link + '//text()')
    loader.add_value("chapters", zip(titles, hrefs))
    return loader.load_item()
def parse_item(self, response):
    """Scrape a blogtruyen manga page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Page <title> is "<name> | <site>"; keep only the name part.
    loader.add_xpath('name', '//title/text()',
                     MapCompose(lambda t: t.split(' | ')[0], str.strip))
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
    loader.add_xpath('description', '//*[@class="content"]//text()',
                     MapCompose(str.strip), Join('\n'), MapCompose(str.strip))

    # Total chapters = the largest integer in any chapter link label.
    numbers = loader.get_xpath(
        '//*[@id="list-chapters"]/p/span/a/text()',
        MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', max(int(n) for n in numbers))

    # Skip chapters hosted on mediafire; they cannot be read in-site.
    links = [
        href
        for href in loader.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/@href', MapCompose(mc))
        if 'mediafire' not in href
    ]
    titles = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/text()')
    loader.add_value('chapters', zip(titles, links))
    loader.add_value('web_source', 'blogtruyen')

    finished = 'Đã hoàn thành' in loader.get_xpath(
        '//*[@class="description"]//text()')
    loader.add_value('full', finished)
    return loader.load_item()
def parse_item(self, response):
    """Scrape a mangasee123 series page; chapters come from its RSS feed.

    @url https://mangasee123.com/manga/Kingdom
    @scrapes name source image_src total_chap description chapters web_source full
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath(
        "unicode_name",
        "//div[@class='container MainContainer']//li[1]/h1/text()")
    # ASCII-fold the title for the searchable "name" field.
    loader.add_value("name",
                     unidecode(loader.get_output_value("unicode_name")[0]))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
    loader.add_xpath("description", "//div[@class='top-5 Content']/text()",
                     Join("\n"))

    status = loader.get_xpath('//*[@class="PublishStatus"]/text()')
    loader.add_value("full", "Complete (Publish)" in status)

    # The chapter list is read from the series RSS feed rather than the
    # HTML; the feed's first entry is the newest chapter.
    rss_path = loader.get_xpath("//a[normalize-space()='RSS Feed']/@href")[0]
    feed = feedparser.parse(BASE_URL + rss_path, agent="Mozilla/5.0")
    loader.add_value(
        "total_chap",
        re.findall(r"\d+", feed['entries'][0]['title'])[0],
    )
    loader.add_value(
        "chapters",
        [(entry['title'], entry['link']) for entry in feed['entries']])
    loader.add_value("web_source", "mangaseeonline")
    return loader.load_item()
def parse_item(self, response):
    """Scrape a nettruyen manga page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath('name', '//h1[@class="title-detail"]/text()')
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="col-xs-4 col-image"]/img/@src')
    loader.add_xpath('description', '//*[@class="detail-content"]/p//text()',
                     Join('\n'))

    # Chapter rows, excluding the "row heading" header row.
    chapter_xpath = ('//*[@id="nt_listchapter"]/nav/ul/li'
                     '[not(contains (@class, "row heading"))]/div[1]/a')
    hrefs = loader.get_xpath(chapter_xpath + '/@href')
    titles = loader.get_xpath(chapter_xpath + '/text()')

    if 'Hoàn thành' in loader.get_xpath(
            '//*[@class="status row"]/p[2]/text()'):
        # Finished series: the newest chapter link carries the final count.
        loader.add_value('full', True)
        loader.add_value(
            'total_chap',
            loader.get_xpath(chapter_xpath + '/text()',
                             MapCompose(lambda t: re.findall(r'\d+', t)),
                             MapCompose(int))[0])
    else:
        # Ongoing series: take the chapter number from the page <title>.
        loader.add_value('full', False)
        loader.add_value(
            'total_chap',
            loader.get_xpath(
                '//title/text()',
                MapCompose(
                    lambda t: re.findall(r' Chapter \d+| Chap \d+', t)),
                MapCompose(lambda t: re.findall(r'\d+', t)),
                MapCompose(int),
                TakeFirst()))

    loader.add_value('chapters', zip(titles, hrefs))
    loader.add_value('web_source', 'nettruyen')
    return loader.load_item()