def _parse_issues(self, response):
    """Generate the list of issues for a manga

    @url https://mangadex.org/manga/39/one-piece
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    manga = response.meta['manga']
    chapters = response.meta['chapters']

    rows = response.xpath('//tr[contains(@id,"chapter_")]')
    for row in rows:
        issue = Issue()
        # Name
        issue['name'] = row.xpath('.//a/text()').extract_first()
        # Number
        issue['number'] = row.xpath('.//a/@data-chapter-num').extract()
        # Order (counts down from the total chapter count)
        issue['order'] = chapters - len(manga['issues'])
        # Language
        issue['language'] = row.xpath('.//img/@title').extract()
        # Release
        issue['release'] = row.xpath('.//time/@datetime').extract()
        # URL
        url = row.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)

    # Next page: keep accumulating issues in the same `manga` item.
    xp = ('//ul[@class="pagination"]/li[@class="active"]'
          '/following-sibling::li[@class="paging"]/a/@href')
    next_url = response.xpath(xp).extract_first()
    if next_url:
        meta = {'manga': manga, 'chapters': chapters}
        return response.follow(next_url, self._parse_issues, meta=meta)
    return manga
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://kissmanga.com/Manga/Naruto
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = ('//span[@class="info" and contains(text(), "%s")]'
          '/following-sibling::a/text()')
    manga['alt_name'] = response.xpath(xp % 'Other name:').extract()
    # Author
    manga['author'] = response.xpath(xp % 'Author:').extract()
    # Artist (the site lists no separate artist field)
    manga['artist'] = manga['author']
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Genres
    manga['genres'] = response.xpath(xp % 'Genres:').extract()
    # Status
    xp = ('//span[@class="info" and contains(text(), "%s")]'
          '/following-sibling::text()[1]')
    manga['status'] = response.xpath(xp % 'Status:').extract()
    # Rank
    manga['rank'] = response.xpath(xp % 'Views:').re(r'(\d+).')
    # Rank order
    manga['rank_order'] = 'DESC'
    # Description
    xp = ('//p[span[@class="info" and contains(text(), "%s")]]'
          '/following-sibling::p[1]/text()')
    manga['description'] = response.xpath(xp % 'Summary:').extract()
    # Cover image
    xp = '//div[@id="rightside"]//img/@src'
    url = response.xpath(xp).extract_first()
    manga['image_urls'] = [response.urljoin(url)]

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//table[@class="listing"]/tr[td]')
    for line in lines:
        issue = Issue(language='EN')
        # Name
        issue['name'] = line.xpath('.//a/text()').extract()
        # Number.  Some examples that this regex needs to address:
        #   1/11 Vol.003 Ch.009.006: Omake 004-koma
        #   21st Century Boys 014
        #   Mob Psycho 100 Ch.099.001: Mob
        #   Mob Psycho 100 Ch.098.002
        #   Fantastic World Vol.001 Ch.002
        #   Black Clover 118 - Mage X
        #   Black Clover 099: Family
        issue['number'] = line.xpath('.//a/text()').re(
            r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)(\d[.\d]+)'
            r'|(\d[.\d]+)[ :-]+'
            r'|(\d[.\d]+)$')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath('./td[2]/text()').re(
            r'\d{1,2}/\d{1,2}/\d{4}')
        # URL
        url = line.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    return manga
def parse_latest(self, response, until=None):
    """Generate the list of new mangas until a date

    @url http://bato.to
    @returns items 1 100
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        if 'until' in response.meta:
            until = response.meta['until']
        else:
            until = date.today()

    last_row, manga = None, None
    for update in response.xpath('.//tr[contains(@class, "row")]'):
        row = update.xpath('@class').extract()[0].split()[0]
        if row != last_row:
            # A new row class starts a new manga entry; emit the
            # previous one first.
            if manga:
                yield manga
            manga = Manga(issues=[])
            # Name
            manga['name'] = update.xpath('td[2]/a[2]/text()').extract()
            # URL
            manga['url'] = update.xpath('td[2]/a[2]/@href').extract()
        else:
            issue = Issue()
            # Name
            xp = './/td/a[img/@style="vertical-align:middle;"]/text()'
            issue['name'] = update.xpath(xp).extract()
            # Number
            issue['number'] = update.xpath(xp).re(
                r'Ch.(?:Story )?([.\d]+)')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Language
            issue['language'] = update.xpath('.//td/div/@title').extract()
            # Release
            issue['release'] = update.xpath(
                './/td[last()]/text()').extract()
            # URL
            xp = './/td/a[img/@style="vertical-align:middle;"]/@href'
            url = update.xpath(xp).extract()[0]
            issue['url'] = response.urljoin(url)

            # Stop as soon as an update is older than `until`
            update_date = convert_to_date(issue['release'][0].strip())
            if update_date < until:
                return
            manga['issues'].append(issue)
        last_row = row

    # Return the last manga
    if manga:
        yield manga

    # Next page
    xp = '//a[@title="Older Releases"]/@href'
    next_url = response.xpath(xp).extract()
    if next_url:
        next_url = response.urljoin(next_url[0])
        yield scrapy.Request(next_url, self.parse_latest,
                             meta={'until': until})
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://www.mangahere.cc/manga/angel_densetsu/
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # MangaHere returns 200 for 404 pages
    if response.xpath('//div[@class="error_404"]').extract():
        return

    # Check if manga is licensed
    xp = '//div[@class="detail_list"]/div[@class="mt10 color_ff00 mb10"]'
    if response.xpath(xp).extract():
        return

    # URL
    manga['url'] = response.url
    # Name
    xp = '//meta[@property="og:title"]/@content'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = '//li[label[contains(text(),"%s")]]/text()'
    manga['alt_name'] = response.xpath(
        xp % 'Alternative Name:').re(r'([^;]+)')
    # Author
    xp = '//li[label[contains(text(),"%s")]]/a/text()'
    manga['author'] = response.xpath(xp % 'Author(s):').extract()
    # Artist
    manga['artist'] = response.xpath(xp % 'Artist(s):').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    xp = '//li[label[contains(text(),"%s")]]/text()'
    manga['status'] = response.xpath(xp % 'Status:').extract_first()
    # Genres
    manga['genres'] = response.xpath(xp % 'Genre(s):').re(r'([^,]+)')
    # Rank
    manga['rank'] = response.xpath(xp % 'Rank:').extract()
    # Rank order
    manga['rank_order'] = 'ASC'
    # Description
    xp = '//li[label[contains(text(),"%s")]]/p[@id="show"]/text()'
    manga['description'] = response.xpath(xp % 'Summary:').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//img[@class="img"]/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//div[@class="detail_list"]/ul[not(@class)]/li')
    # Check if the lines are empty
    if len(lines) == 1 and 'No Manga Chapter' in lines[0].extract():
        return

    for line in lines:
        issue = Issue(language='EN')
        # Name is spread over three sibling elements
        name_1 = line.xpath('.//a/text()').extract()
        name_2 = line.xpath('.//span[@class="mr6"]/text()').extract()
        name_3 = line.xpath('.//span[@class="left"]/text()').extract()
        issue['name'] = name_1 + name_2 + name_3
        # Number
        issue['number'] = line.xpath('.//a/text()').re(r'([.\d]+)\s*$')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath(
            './/span[@class="right"]/text()').extract()
        # URL
        url = line.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    return manga
def parse_latest(self, response, until=None):
    """Generate the list of new mangas until a date

    @url http://www.mangahere.co/latest/
    @returns items 1 100
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        if 'until' in response.meta:
            until = response.meta['until']
        else:
            until = date.today()

    for update in response.xpath('//div[@class="manga_updates"]/dl'):
        # Check if is a new update
        update_date = update.xpath(
            './/span[@class="time"]/text()').extract()
        update_date = convert_to_date(update_date[0])
        if update_date < until:
            return

        manga = Manga()
        # Name
        manga['name'] = update.xpath(
            './/a[@class="manga_info"]/text()').extract()
        # URL
        url = update.xpath('.//a[@class="manga_info"]/@href').extract()
        manga['url'] = response.urljoin(url[0])

        # Parse the manga issues list
        manga['issues'] = []
        for line in update.xpath('./dd'):
            issue = Issue(language='EN')
            # Name
            issue['name'] = line.xpath('a/text()').extract()
            # Number
            issue['number'] = line.xpath('a/text()').re(r'([.\d]+)\s*$')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Release
            issue['release'] = update_date
            # URL
            url = line.xpath('a/@href').extract()
            issue['url'] = response.urljoin(url[0])
            manga['issues'].append(issue)
        yield manga

    # Next page
    next_url = response.xpath('//a[@class="next"]/@href').extract()
    if next_url:
        next_url = response.urljoin(next_url[0])
        yield scrapy.Request(next_url, self.parse_latest,
                             meta={'until': until})
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://bato.to/comic/_/comics/angel-densetsu-r460
    @returns items 1 1
    @returns request 0 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL.  Batoto can have different URLs for comics (_/comics/,
    # _/sp/), so here we update the manga with the real one.
    manga['url'] = response.url
    # Name
    xp = '//h1[@class="ipsType_pagetitle"]/text()'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = '//td[contains(text(),"%s")]/following-sibling::td/*/text()'
    manga['alt_name'] = response.xpath(xp % 'Alt Names:').re(r'([^,;]+)')
    # Author
    manga['author'] = response.xpath(xp % 'Author:').extract()
    # Artist
    manga['artist'] = response.xpath(xp % 'Artist:').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
    manga['status'] = response.xpath(xp % 'Status:').extract()
    # Genres
    xp = '//td[contains(text(),"%s")]/following-sibling::td/a/span/text()'
    manga['genres'] = response.xpath(xp % 'Genres:').extract()
    # Description
    xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
    manga['description'] = response.xpath(xp % 'Description:').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//div[@class="ipsBox"]/div/div/img/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    xp = ('//tr[contains(@class,"chapter_row")'
          ' and not(contains(@class,"chapter_row_expand"))]')
    lines = response.xpath(xp)
    for line in lines:
        issue = Issue()
        # Name
        issue['name'] = line.xpath('./td[1]/a/text()').extract()
        # Number (parsed from the same anchor text as the name)
        issue['number'] = line.xpath('./td[1]/a/text()').re(
            r'Ch.(?:Story )?([.\d]+)')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Language
        issue['language'] = line.xpath('./td[2]/div/@title').extract()
        # Release
        issue['release'] = line.xpath('./td[5]/text()').extract()
        # URL
        url = line.xpath('./td[1]/a/@href').extract()[0]
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    yield manga
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://submanga.org/bloody-cross
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    xp = '//a[@class="btn-link text-semibold text-mint"][2]/text()'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = '//span[contains(text(), "%s")]/following-sibling::text()'
    title = u'Títulos alternativos:'
    manga['alt_name'] = response.xpath(xp % title).re(r'([^,;]+)')
    # Author
    manga['author'] = response.xpath(xp % 'Autor:').extract()
    # Artist (the site lists no separate artist field)
    manga['artist'] = manga['author']
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status (Finalizado / En curso)
    xp = '//span[@class="text-2x text-thin"]/text()'
    manga['status'] = response.xpath(xp).extract()
    # Genres
    xp = '//span[contains(text(), "Generos:")]/following-sibling::a/text()'
    manga['genres'] = response.xpath(xp).extract()
    # Description
    manga['description'] = response.xpath(
        '//p[@class="text-justify"]/text()').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//img[@class="img-cover-m"]/@src').extract()

    # Parse the manga issues list.  A single row can carry one flag per
    # translation, so each row may produce several issues.
    manga['issues'] = []
    for line in response.xpath('//table[@id="caps-list"]//tr'):
        langs = line.xpath('td[3]/a/img/@alt').extract()
        for lang in langs:
            issue = Issue(language=lang)
            # Name
            issue['name'] = line.xpath('td[2]/a/text()').extract()
            # Number
            issue['number'] = line.xpath('td[1]/a/text()').extract_first()
            # Order (the chapter number itself; assumes it is an
            # integer string -- NOTE(review): int() raises on decimal
            # or missing chapter numbers, confirm upstream data)
            issue['order'] = int(issue['number'])
            # Release
            issue['release'] = line.xpath(
                'td[4]/a/span/text()').extract()
            # URL (the language is appended as the last path component)
            url = line.xpath('td[1]/a/@href').extract_first()
            issue['url'] = '%s/%s' % (url, lang)
            manga['issues'].append(issue)
    return manga
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://unionmangas.net/manga/bleach
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    xp = '//div[@class="col-md-12"]/h2/text()'
    manga['name'] = response.xpath(xp).extract_first()
    # Alternate name
    manga['alt_name'] = manga['name']
    # Author
    xp = '//label[contains(text(), "%s")]/following-sibling::text()'
    manga['author'] = response.xpath(xp % 'Autor:').extract()
    # Artist
    manga['artist'] = response.xpath(xp % 'Artista:').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status (Ativo / Completo)
    xp = '//label[contains(text(), "Status:")]' \
        '/following-sibling::span/text()'
    manga['status'] = response.xpath(xp).extract()
    # Genres
    xp = u'//label[contains(text(), "Gênero(s):")]' \
        u'/following-sibling::a/text()'
    manga['genres'] = response.xpath(xp).extract()
    # Description
    xp = '//div[@class="panel-body"]/text()'
    manga['description'] = response.xpath(xp).extract()
    # Cover image
    xp = '//img[@class="img-thumbnail"]/@src'
    manga['image_urls'] = response.xpath(xp).extract()

    # Parse the manga issues list
    manga['issues'] = []
    xp = '//div[@class="col-xs-6 col-md-6"]'
    lines = response.xpath(xp)
    for line in lines:
        issue = Issue(language='PT')
        # Name
        issue['name'] = line.xpath('a/text()').extract()
        # Number
        issue['number'] = line.xpath('a/text()').re(r'Cap. ([.\d]+)$')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath('span/text()').re(
            r'\d{2}/\d{2}/\d{4}')
        # URL.  BUG FIX: the original stored the raw extract() list;
        # every sibling spider stores a single URL string resolved
        # against the page, so take the first href and urljoin it.
        url = line.xpath('a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    return manga
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://mangaseeonline.us/manga/Shingeki-No-Kyojin
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath('//h1/text()').extract()
    # Alternate name
    xp = '//div/b[contains(text(),"%s")]/following-sibling::text()'
    manga['alt_name'] = response.xpath(
        xp % 'Alternate Name(s):').re(r'([^,]+)')
    # Author
    xp = '//div/b[contains(text(),"%s")]/following-sibling::*/text()'
    manga['author'] = response.xpath(xp % 'Author(s):').extract()
    # Artist (the site lists no separate artist field)
    manga['artist'] = manga['author']
    # Reading direction
    manga['reading_direction'] = response.xpath(
        xp % 'Type:').extract_first()
    # Status
    manga['status'] = response.xpath(xp % 'Status:').extract_first()
    # Genres
    xp = '//div/b[contains(text(),"%s")]/following-sibling::*/text()'
    manga['genres'] = response.xpath(xp % 'Genre(s):').extract()
    # Rank order
    manga['rank_order'] = 'DESC'
    # Description
    manga['description'] = response.xpath(
        '//div[@class="description"]/text()').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//div[contains(@class,"leftImage")]/img/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//a[@class="list-group-item"]')
    for line in lines:
        issue = Issue(language='EN')
        # Name
        issue['name'] = line.xpath('./span/text()').extract()
        # Number
        issue['number'] = line.xpath('./@chapter').extract()
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath('./time/@datetime').extract()
        # URL
        url = line.xpath('./@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)

    # Rank: needs an extra POST to the subscribe endpoint; the manga
    # item is completed in the `_parse_subscribe` callback.
    url = response.urljoin('subscribe.button.php')
    index_name = response.xpath(
        '//input[@class="IndexName"]/@value').extract_first()
    return scrapy.FormRequest(url, self._parse_subscribe,
                              formdata={'IndexName': index_name},
                              meta={'manga': manga})
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://kissmanga.com/Manga/Naruto
    @returns items 1 1
    @returns request 0 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # Name
    xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = '//span[@class="info" and contains(text(), "%s")]' \
        '/following-sibling::a/text()'
    manga['alt_name'] = response.xpath(xp % 'Other name:').extract()
    # Author
    manga['author'] = response.xpath(xp % 'Author:').extract()
    # Artist (the site lists no separate artist field)
    manga['artist'] = manga['author']
    # Genres
    manga['genres'] = response.xpath(xp % 'Genres:').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    xp = '//span[@class="info" and contains(text(), "%s")]' \
        '/following-sibling::text()[1]'
    manga['status'] = response.xpath(xp % 'Status:').extract()
    # Rank
    manga['rank'] = response.xpath(xp % 'Views:').re(r'(\d+).')
    manga['rank_order'] = 'DESC'
    # Description
    xp = '//p[span[@class="info" and contains(text(), "%s")]]' \
        '/following-sibling::p[1]/text()'
    manga['description'] = response.xpath(xp % 'Summary:').extract()
    # Cover image
    xp = '//div[@id="rightside"]//img/@src'
    url = response.xpath(xp).extract()
    manga['image_urls'] = [response.urljoin(url[0])]

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//table[@class="listing"]/tr[td]')
    for line in lines:
        issue = Issue(language='EN')
        # Name
        issue['name'] = line.xpath('.//a/text()').extract()
        # Number
        number = line.xpath('.//a/text()').re(
            r'(?:[Vv]ol.[.\d]+)?\s*'
            r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)?(\d[.\d]+)')
        # BUG FIX: the original used `number[0] if len(number) > 1 else
        # number`, so the common single-match case kept a one-element
        # list while the multi-match case stored a string.  Normalize:
        # take the first match whenever there is one, keep the empty
        # list when there is none.
        issue['number'] = number[0] if number else number
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath('./td[2]/text()').re(
            r'\d{1,2}/\d{1,2}/\d{4}')
        # URL
        url = line.xpath('.//a/@href').extract()
        issue['url'] = response.urljoin(url[0])
        manga['issues'].append(issue)
    yield manga
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url https://www.mangareader.net/178/angel-densetsu.html
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath(
        '//h2[@class="aname"]/text()').extract()
    # Alternate name
    xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
    manga['alt_name'] = response.xpath(
        xp % 'Alternate Name:').re(r'([^,;]+)')
    # Author
    manga['author'] = response.xpath(xp % 'Author:').extract()
    # Artist
    manga['artist'] = response.xpath(xp % 'Artist:').extract()
    # Reading direction
    rd = response.xpath(xp % 'Reading Direction:').extract_first()
    manga['reading_direction'] = 'RL' if rd == 'Right to Left' else 'LR'
    # Status
    manga['status'] = response.xpath(xp % 'Status:').extract()
    # Genres
    manga['genres'] = response.xpath(
        '//span[@class="genretags"]/text()').extract()
    # Description
    manga['description'] = '\n'.join(response.xpath(
        '//div[@id="readmangasum"]/p/text()').extract())
    # Cover image
    manga['image_urls'] = response.xpath(
        '//div[@id="mangaimg"]/img/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    for line in response.xpath('//table[@id="listing"]/tr[td]'):
        issue = Issue(language='EN')
        # Name: link text plus the text that follows the link
        name_1 = line.xpath('.//a/text()').extract()
        name_2 = line.xpath('.//a/following-sibling::text()').extract()
        issue['name'] = name_1 + name_2
        # Number
        issue['number'] = line.xpath('.//a/text()').re(r'([.\d]+)$')
        # Order: 0-based position in the listing
        issue['order'] = len(manga['issues'])
        # Release
        issue['release'] = line.xpath('./td[2]/text()').extract()
        # URL
        url = line.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    return manga
def parse_latest(self, response, until=None):
    """Generate the list of new mangas until a date

    @url http://www.mangareader.net/latest
    @returns items 1 30
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        if 'until' in response.meta:
            until = response.meta['until']
        else:
            until = date.today()

    for update in response.xpath(
            '//table[@class="updates"]/tr[@class="c2"]'):
        # Check if is a new update
        update_date = update.xpath('.//td[@class="c1"]/text()').extract()
        update_date = convert_to_date(update_date[0])
        if update_date < until:
            return

        manga = Manga()
        # Name
        manga['name'] = update.xpath(
            './/a[@class="chapter"]/strong/text()').extract()
        # URL
        url = update.xpath('.//a[@class="chapter"]/@href').extract()
        manga['url'] = response.urljoin(url[0])

        # Parse the manga issues list
        manga['issues'] = []
        for line in update.xpath('.//a[@class="chaptersrec"]'):
            issue = Issue(language='EN')
            # Name
            issue['name'] = line.xpath('text()').extract()
            # Number
            issue['number'] = line.xpath('text()').re(r'(\d+)$')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Release
            issue['release'] = update_date
            # URL
            url = line.xpath('@href').extract()
            issue['url'] = response.urljoin(url[0])
            manga['issues'].append(issue)
        yield manga

    # Next page
    xp = '//div[@id="latest"]/div[@id="sp"]/a[contains(., ">")]/@href'
    next_url = response.xpath(xp).extract()
    if next_url:
        next_url = response.urljoin(next_url[0])
        yield scrapy.Request(next_url, self.parse_latest,
                             meta={'until': until})
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url http://mangafox.me/manga/a_bias_girl/
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # Check if manga is licensed
    xp = '//div[@class="warning" and contains(text(),"has been licensed")]'
    if response.xpath(xp).extract():
        return

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath('//title/text()').re(r'(.*) - Read')
    # Alternate name
    manga['alt_name'] = response.xpath(
        '//div[@id="title"]/h3//text()').extract()
    # Author
    manga['author'] = response.xpath(
        '//div[@id="title"]//tr[2]/td[2]/a/text()').extract()
    # Artist
    manga['artist'] = response.xpath(
        '//div[@id="title"]//tr[2]/td[3]/a/text()').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    manga['status'] = response.xpath(
        './/div[@class="data"][1]/span/text()').re(r'\w+')
    # Genres
    manga['genres'] = response.xpath(
        '//div[@id="title"]//tr[2]/td[4]/a/text()').extract()
    # Description
    manga['description'] = response.xpath(
        '//div[@id="title"]/p[@class="summary"]/text()').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//div[@class="cover"]/img/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//ul[@class="chlist"]/li')
    for line in lines:
        issue = Issue(language='EN')
        # Name: heading link text plus the optional subtitle span
        name_1 = line.xpath('(.//h3|.//h4)/a/text()').extract()
        name_2 = line.xpath(
            '(.//h3|.//h4)/span[@class="title nowrap"]/text()').extract()
        issue['name'] = name_1 + name_2
        # Number
        issue['number'] = line.xpath(
            '(.//h3|.//h4)/a/text()').re(r'.*?([.\d]+)$')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath(
            './/span[@class="date"]/text()').extract()
        # URL
        issue['url'] = line.xpath('.//a[@class="tips"]/@href').extract()
        manga['issues'].append(issue)
    return manga
def parse_latest(self, response, until=None):
    """Generate the list of new mangas until a date

    @url http://mangafox.me/releases/
    @returns items 1 100
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        if 'until' in response.meta:
            until = response.meta['until']
        else:
            until = date.today()

    for update in response.xpath('//ul[@id="updates"]/li/div'):
        manga = Manga()
        # Name
        manga['name'] = update.xpath('.//h3/a/text()').extract()
        # URL
        manga['url'] = update.xpath('.//h3/a//@href').extract()

        # Parse the manga issues list
        manga['issues'] = []
        for line in update.xpath('.//dt'):
            # Check if is a new update
            update_date = update.xpath('.//em/text()').extract()
            update_date = convert_to_date(update_date[0])
            if update_date < until:
                return

            issue = Issue(language='EN')
            # Name
            issue['name'] = line.xpath('.//span//text()').extract()
            # Number
            issue['number'] = line.xpath('.//span/a/text()').re(r'(\d+)$')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Release
            issue['release'] = update_date
            # URL
            url = line.xpath('.//span/a/@href').extract()
            issue['url'] = urljoin(response.url, url[0])
            manga['issues'].append(issue)
        yield manga

    # Next page
    next_url = response.xpath('//a[span[@class="next"]]/@href').extract()
    if next_url:
        next_url = urljoin(response.url, next_url[0])
        yield scrapy.Request(next_url, self.parse_latest,
                             meta={'until': until})
def parse_collection(self, response, manga=None):
    """Generate the list of issues for a manga

    @url https://bato.to/series/68329
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    if 'manga' in response.meta:
        manga = response.meta['manga']
    else:
        manga = Manga(url=response.url)

    # URL
    manga['url'] = response.url
    # Name
    xp = '//h3[@class="item-title"]/a/text()'
    manga['name'] = response.xpath(xp).extract()
    # Alternate name
    xp = '//div[@class="pb-2 alias-set hairlines-fade-bottom"]/text()'
    manga['alt_name'] = response.xpath(xp).extract_first().split('/')
    # Author: first entry of the "Authors:" list; the rest are taken as
    # the artists.
    xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
        '/following-sibling::span/*/text()'
    manga['author'] = response.xpath(xp % 'Authors:').extract_first()
    manga['artist'] = response.xpath(xp % 'Authors:').extract()[1:]
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
        '/following-sibling::span/text()'
    manga['status'] = response.xpath(xp % 'Status:').extract()
    # Genres
    genres = response.xpath(xp % 'Genres:').extract()[-1]
    manga['genres'] = genres.split('/')
    # Rank (keep only the leading figure before the first comma)
    rank = response.xpath(xp % 'Rank:').extract_first()
    manga['rank'] = rank.split(',')[0]
    # Rank order
    manga['rank_order'] = 'ASC'
    # Description
    manga['description'] = response.xpath('//pre/text()').extract()
    # Cover image
    url = response.xpath('//img[@class="shadow-6"]/@src').extract_first()
    manga['image_urls'] = [response.urljoin(url)]

    # Get language from the title flag
    xp = '//div[@class="mt-4 title-set"]/span/@class'
    language = response.xpath(xp).extract_first()
    language = language.split()[-1]

    # Parse the manga issues list
    manga['issues'] = []
    lines = response.xpath('//div[@class="main"]/div')
    for line in lines:
        issue = Issue(language=language)
        # Name
        issue['name'] = line.xpath('./a//text()').extract()
        # Number.  BUG FIX: the original pattern `Ch.(\d+)` dropped the
        # fractional part of decimal chapters (e.g. 'Ch.10.5' -> '10');
        # use the `[.\d]` form used by the other spiders in this file.
        issue['number'] = line.xpath('./a/b/text()').re(r'Ch.(\d[.\d]*)')
        # Order
        issue['order'] = len(lines) - len(manga['issues'])
        # Release
        issue['release'] = line.xpath('.//i/text()').extract()
        # URL
        url = line.xpath('./a/@href').extract_first()
        issue['url'] = response.urljoin(url)
        manga['issues'].append(issue)
    return manga