def parse_latest(self, response, until=None):
    """Request a collection parse for every entry listed on the page.

    @url http://kissmanga.com/
    @returns items 0 0
    @returns request 5 10
    @scrapes url name issues
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # XXX TODO - `until` is resolved but never applied: the whole
    # initial scroll panel (old entries included) is parsed.
    for entry in response.xpath('//div[@class="items"]/div'):
        manga = Manga()
        # Name: every text node found under the anchors of the entry.
        manga['name'] = entry.xpath('.//a/text()').extract()
        # URL: the first href of the entry, resolved against the page.
        hrefs = entry.xpath('.//a/@href').extract()
        manga['url'] = response.urljoin(hrefs[0])

        # Hand the partially filled item over to `parse_collection`.
        yield scrapy.Request(manga['url'], self.parse_collection,
                             meta={'manga': manga})
def _parse_latest(self, response):
    """Follow the first `list-link` anchor of the page as a manga."""
    href = response.xpath('//a[@class="list-link"]/@href').extract_first()
    target = response.urljoin(href)
    # The item only carries its URL; `parse_collection` receives it
    # through the request meta.
    return response.follow(target, self.parse_collection,
                           meta={'manga': Manga(url=target)})
def parse_latest(self, response, until=None):
    """Request a collection parse for every timeline entry of the page.

    @url http://submanga.org
    @returns items 0 0
    @returns request 5 10
    @scrapes url name issues
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    for entry in response.xpath('//div[@class="timeline-entry"]'):
        manga = Manga()
        # Name
        name_xp = './/span[@class="text-mint"]/text()'
        manga['name'] = entry.xpath(name_xp).extract()
        # URL (stored as the full list of extracted hrefs)
        manga['url'] = entry.xpath('.//a/@href').extract()

        # There is no link to the issue, nor a date, so the full
        # manga is parsed; the request targets the first href.
        yield scrapy.Request(manga['url'][0], self.parse_collection,
                             meta={'manga': manga})
def parse_catalog(self, response):
    """Walk the popularity listing and request every manga found.

    @url https://www.mangareader.net/popular/3660
    @returns items 0
    @returns request 30 40
    """
    for entry in response.xpath('//div[@class="mangaresultitem"]'):
        manga = Manga()
        # URL
        href_xp = './/div[@class="manga_name"]//a/@href'
        manga['url'] = response.urljoin(entry.xpath(href_xp).extract_first())
        # Rank
        manga['rank'] = entry.xpath('.//div[@class="c1"]/text()').re(r'(\d+).')
        # Rank order
        manga['rank_order'] = 'ASC'

        yield response.follow(manga['url'], self.parse_collection,
                              meta={'manga': manga})

    # Next page: the pager anchor whose text contains ">".
    pager_xp = '//div[@id="sp"]/a[contains(., ">")]/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield response.follow(next_url, self.parse_catalog)
def parse_latest(self, response, until=None):
    """Request a collection parse for every series linked on the page.

    @url http://mangaseeonline.us/
    @returns items 0
    @returns request 25 100
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL comes from this single page and is refreshed via
    # `parse_collection`.
    for href in response.xpath('//a[@class="latestSeries"]/@href').extract():
        target = response.urljoin(href)
        yield scrapy.Request(target, self.parse_collection,
                             meta={'manga': Manga(url=target)})

    # Check the oldest update date.
    # NOTE(review): nothing follows this check, so apart from evaluating
    # the date it does not change what the method yields.
    stamps = response.xpath('//time[@class="timeago"]/@datetime').extract()
    oldest = convert_to_date(stamps[-1].strip())
    if oldest < until:
        return
def parse_catalog(self, response):
    """Walk the paginated manga list and request every manga found.

    @url http://kissmanga.com/MangaList?page=200
    @returns items 0
    @returns request 25 60
    """
    # The first cell of each listing row holds the manga link.
    for cell in response.xpath('//table[@class="listing"]/tr/td[1]'):
        manga = Manga()
        # URL
        manga['url'] = response.urljoin(cell.xpath('a/@href').extract_first())
        yield scrapy.Request(manga['url'], self.parse_collection,
                             meta={'manga': manga})

    # Next page
    pager_xp = '//ul[@class="pager"]/li/a[contains(., "Next")]/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), self.parse_catalog)
def parse_catalog(self, response):
    """Walk the A-Z listing and request every manga found.

    @url http://unionmangas.net/mangas/a-z/10
    @returns items 0
    @returns request 1 50
    """
    for block in response.xpath('//div[contains(@class, "bloco-manga")]'):
        manga = Manga()
        # URL (taken as-is from the second anchor of the block)
        manga['url'] = block.xpath('a[2]/@href').extract_first()
        # Rank: the view counter stored in a hidden div.
        views_xp = 'div[@style="display: none"]/text()'
        manga['rank'] = block.xpath(views_xp).re(r'([\d.]+) views')
        # Rank order
        manga['rank_order'] = 'DESC'

        yield scrapy.Request(manga['url'], self.parse_collection,
                             meta={'manga': manga})

    # Next page
    pager_xp = '//ul[@class="pagination"]/li/a[contains(., "Next")]/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), self.parse_catalog)
def parse_catalog(self, response):
    """Generate the catalog (list of mangas) of the site.

    Yields one request per manga of the directory page (handled by
    `parse_collection`, with the partially filled item in the request
    meta), followed by a request for the next directory page when a
    "next" link is present.

    @url http://mangafox.me/directory/
    @returns items 0
    @returns request 30 45
    """
    xp = '//ul[@class="list"]/li'
    for item in response.xpath(xp):
        manga = Manga()
        # URL
        xp = './/a[@class="title"]/@href'
        manga['url'] = item.xpath(xp).extract_first()
        # Rank
        xp = './/p[@class="info"]/label/text()'
        # Raw string: `\d` inside a plain literal is an invalid escape
        # sequence and triggers a warning on modern Python.
        manga['rank'] = item.xpath(xp).re(r'(\d+)')
        # Rank order
        manga['rank_order'] = 'ASC'

        meta = {'manga': manga}
        request = scrapy.Request(manga['url'], self.parse_collection,
                                 meta=meta)
        yield request

    # Next page
    xp = '//a[span[@class="next"]]/@href'
    next_url = response.xpath(xp).extract_first()
    if next_url:
        next_url = urljoin(response.url, next_url)
        yield scrapy.Request(next_url, self.parse_catalog)
def parse_catalog(self, response):
    """Walk the paginated manga list and request every manga found.

    @url http://submanga.org/mangas
    @returns items 0
    @returns request 1 37
    """
    # The page number comes from the `page` query argument (0 if absent).
    query = urlparse.parse_qs(urlparse.urlparse(response.url).query)
    page = int(query['page'][0]) if 'page' in query else 0

    entries = response.xpath('//div[contains(@class, "item_manga")]')
    for position, entry in enumerate(entries, 1):
        manga = Manga()
        # URL
        manga['url'] = entry.xpath('a/@href').extract_first()
        # Rank: position in the whole listing.
        # NOTE(review): the 6 * 6 factor presumably is the number of
        # entries per page - confirm against the site layout.
        manga['rank'] = page * 6 * 6 + position
        # Rank order
        manga['rank_order'] = 'ASC'

        yield scrapy.Request(manga['url'], self.parse_collection,
                             meta={'manga': manga})

    # Next page
    pager_xp = '//ul[@class="pagination"]/li/a[@rel="next"]/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), self.parse_catalog)
def _parse_subscribe(self, response): if 'manga' in response.meta: manga = response.meta['manga'] else: # This is not correct at all, but we can use this to allow # the testing for this contract manga = Manga(url=response.url) xp = '//span[@id="numSubscribe"]/@alt' manga['rank'] = response.xpath(xp).extract_first() return manga
def parse_catalog(self, response):
    """Walk the AJAX search results and request every manga found.

    @url http://bato.to/search_ajax?p=200
    @returns items 0
    @returns request 30 40
    """
    def first_or(values, default=0.0):
        # First element of `values`, or `default` when it is empty.
        return values[0] if values else default

    for row in response.xpath('//tr[not(@class) and not(@id)]'):
        manga = Manga()
        # URL
        manga['url'] = row.xpath('./td[1]/strong/a/@href').extract_first()

        # Rank
        # In Batoto there is no rank, but a combination of rating,
        # viewers and followers.
        rating_hits = row.xpath('./td[3]/div/@title').re(r'([.\d]+)/5')
        rating = float(first_or(rating_hits))
        viewers = convert_to_number(
            row.xpath('./td[4]/text()').extract_first())
        followers = convert_to_number(
            row.xpath('./td[5]/text()').extract_first())
        manga['rank'] = (rating + 0.1) * viewers * followers
        # Rank order
        manga['rank_order'] = 'DESC'

        # URL hack to avoid a redirection. This is used because the
        # download_delay is also added to the redirector.
        #
        # This makes the spider a bit faster, but the real URL still
        # needs to be updated on the `parse_collection` side.
        slug = manga['url'].split('_/')[-1]
        direct_url = 'http://bato.to/comic/_/comics/%s' % slug
        # Also use this URL in the item to avoid duplicates.
        manga['url'] = direct_url

        yield scrapy.Request(direct_url, self.parse_collection,
                             meta={'manga': manga})

    # Next page: the page number is read from the "show more" button.
    onclick_xp = '//tr[@id="show_more_row"]/td/input/@onclick'
    page_hits = response.xpath(onclick_xp).re(r'.*, (\d+)\)')
    if page_hits:
        next_url = AJAX_SEARCH % (int(page_hits[0]) + 1)
        yield scrapy.Request(next_url, self.parse_catalog)
def parse_catalog(self, response):
    """Request every manga linked from the directory page.

    @url http://mangaseeonline.us/directory/
    @returns items 0
    @returns request 3500-4500
    """
    for href in response.xpath('//a[@class="ttip"]/@href').extract():
        manga = Manga()
        # URL
        manga['url'] = response.urljoin(href)
        yield response.follow(manga['url'], self.parse_collection,
                              meta={'manga': manga})
def parse_catalog(self, response):
    """Request every manga linked from the full manga list.

    @url http://www.mangahere.cc/mangalist/
    @returns items 0
    @returns request 18000 22000
    """
    for anchor in response.xpath('//a[@class="manga_info"]'):
        manga = Manga()
        # URL
        href = anchor.xpath('./@href').extract_first()
        manga['url'] = response.urljoin(href)
        yield response.follow(manga['url'], self.parse_collection,
                              meta={'manga': manga})
def parse_latest(self, response, until=None):
    """Request a collection parse for every title linked on the page.

    @url http://unionmangas.site/
    @returns items 0
    @returns request 10 100
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL comes from this single page and is refreshed via
    # `parse_collection`; `until` is not used to filter entries here.
    for href in response.xpath('//a[@class="link-titulo"]/@href').extract():
        yield response.follow(href, self.parse_collection,
                              meta={'manga': Manga(url=href)})
def parse_latest(self, response, until=None):
    """Request a collection parse for every timeline link of the page.

    @url http://submanga.org
    @returns items 0
    @returns request 5 10
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL comes from this single page and is refreshed via
    # `parse_collection`; `until` is not used to filter entries here.
    links_xp = '//div[@class="timeline-entry"]//a/@href'
    for href in response.xpath(links_xp).extract():
        yield scrapy.Request(href, self.parse_collection,
                             meta={'manga': Manga(url=href)})
def parse_catalog(self, response):
    """Request every manga linked from the full manga list.

    @url http://www.mangahere.co/mangalist/
    @returns items 0 0
    @returns request 15000 20000
    """
    for anchor in response.xpath('//a[@class="manga_info"]'):
        manga = Manga()
        # Name
        manga['name'] = anchor.xpath('./text()').extract()
        # URL
        hrefs = anchor.xpath('./@href').extract()
        manga['url'] = response.urljoin(hrefs[0])

        yield scrapy.Request(manga['url'], self.parse_collection,
                             meta={'manga': manga})
def parse_latest(self, response, until=None):
    """Request updated mangas page by page until `until` is passed.

    @url http://www.mangahere.cc/latest/
    @returns items 0
    @returns request 25 200
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL of the page is refreshed via `parse_collection`.
    for href in response.xpath('//a[@class="manga_info"]/@href').extract():
        target = response.urljoin(href)
        yield scrapy.Request(target, self.parse_collection,
                             meta={'manga': Manga(url=target)})

    # Stop paginating when the last date listed on the page is older
    # than `until`.
    stamps = response.xpath('//span[@class="time"]/text()').extract()
    if convert_to_date(stamps[-1]) < until:
        return

    # Next page, carrying `until` along in the request meta.
    next_url = response.xpath('//a[@class="next"]/@href').extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), self.parse_latest,
                             meta={'until': until})
def parse_latest(self, response, until=None):
    """Generate the list of new mangas until a date

    Every manga linked from the page is scheduled for an update via
    `parse_collection`. While the last date listed on the page is not
    older than `until`, the next page is requested through
    `parse_latest` again, with `until` carried in the request meta.

    `until` defaults to `response.meta['until']` when present, else to
    today's date.

    @url https://mangadex.org/
    @returns items 0
    @returns request 120
    """
    if not until:
        if 'until' in response.meta:
            until = response.meta['until']
        else:
            until = date.today()

    # Get all manga's URL from the same page and update it via
    # `parse_collection`
    xp = '//a[@class="manga_title"]/@href'
    for url in response.xpath(xp).extract():
        url = response.urljoin(url)
        manga = Manga(url=url)
        meta = {'manga': manga}
        yield response.follow(url, self.parse_collection, meta=meta)

    # Check the oldest update date. A page without any date gives
    # nothing to compare against, so pagination stops there instead of
    # failing with an IndexError.
    xp = '//time/@datetime'
    dates = response.xpath(xp).extract()
    if not dates:
        return
    update_date = convert_to_date(dates[-1].strip())
    if update_date < until:
        return

    # Next page. The callback must be `parse_latest` with the `until`
    # limit in the meta; the previous code sent the page to
    # `_parse_issues` with the meta of the last manga of the loop.
    xp = '//ul[@class="pagination"]/li[@class="active"]' \
        '/following-sibling::li[@class="paging"]/a/@href'
    next_url = response.xpath(xp).extract_first()
    if next_url:
        meta = {'until': until}
        yield response.follow(next_url, self.parse_latest, meta=meta)
def parse_latest(self, response, until=None):
    """Request updated mangas page by page until `until` is passed.

    @url http://bato.to
    @returns items 0
    @returns request 25 100
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL of the page is refreshed via `parse_collection`.
    links_xp = '//td[@colspan="5"]/a[@style="font-weight:bold;"]/@href'
    for href in response.xpath(links_xp).extract():
        yield scrapy.Request(href, self.parse_collection,
                             meta={'manga': Manga(url=href)})

    # Stop paginating when the last date listed on the page is older
    # than `until`.
    dates_xp = '//td[contains(@style, "font-size: 11px")]/text()'
    stamps = response.xpath(dates_xp).extract()
    if convert_to_date(stamps[-1].strip()) < until:
        return

    # Next page, carrying `until` along in the request meta.
    older_xp = '//a[@title="Older Releases"]/@href'
    next_url = response.xpath(older_xp).extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), self.parse_latest,
                             meta={'until': until})
def parse_catalog(self, response):
    """Walk the paginated title list and request every manga found.

    @url https://mangadex.org/titles/2/1
    @returns items 0
    @returns request 40
    """
    for cell in response.xpath('//div[@class="row"]/div'):
        manga = Manga()
        # URL: the item stores the absolute form, the request follows
        # the href exactly as extracted.
        href = cell.xpath('.//a[@class="manga_title"]/@href').extract_first()
        manga['url'] = response.urljoin(href)
        yield response.follow(href, self.parse_collection,
                              meta={'manga': manga})

    # Next page: the paging entry right after the active one.
    pager_xp = '//ul[@class="pagination"]/li[@class="active"]' \
        '/following-sibling::li[@class="paging"]/a/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield response.follow(next_url, self.parse_catalog)
def parse_latest(self, response, until=None):
    """Request updated mangas page by page until `until` is passed.

    @url https://www.mangareader.net/latest
    @returns items 0
    @returns request 25 100
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Every manga URL of the page is refreshed via `parse_collection`.
    for href in response.xpath('//a[@class="chapter"]/@href').extract():
        target = response.urljoin(href)
        yield response.follow(target, self.parse_collection,
                              meta={'manga': Manga(url=target)})

    # Stop paginating when the last date listed on the page is older
    # than `until`.
    stamps = response.xpath('//td[@class="c1"]/text()').extract()
    if convert_to_date(stamps[-1]) < until:
        return

    # Next page, carrying `until` along in the request meta.
    pager_xp = '//div[@id="latest"]/div[@id="sp"]/a[contains(., ">")]/@href'
    next_url = response.xpath(pager_xp).extract_first()
    if next_url:
        yield response.follow(next_url, self.parse_latest,
                              meta={'until': until})
def parse_catalog(self, response):
    """Generate the catalog (list of mangas) of the site.

    Yields one request per manga of the browse page (handled by
    `parse_collection`, with the partially filled item in the request
    meta). The next page number is read from the last
    `@click="onClickPage(...)"` handler found in the page source; when
    the page has no such handler, no further page is requested.

    @url https://bato.to/browse?page=2
    @returns items 0
    @returns request 50 70
    """
    xp = '//div[@id="series-list"]//div[@class="item-text"]'
    for item in response.xpath(xp):
        manga = Manga()
        # URL
        xp = './a/@href'
        url = item.xpath(xp).extract_first()
        manga['url'] = response.urljoin(url)
        meta = {'manga': manga}
        yield response.follow(url, self.parse_collection, meta=meta)

    # Next page
    re_ = r'@click="onClickPage\((.*)\)"'
    # `response.text` replaces the deprecated `body_as_unicode()`.
    pages = re.findall(re_, response.text)
    # Indexing `[-1]` on an empty result raised IndexError on pages
    # without a pager, so emptiness is checked before taking the last
    # match.
    if pages and pages[-1]:
        next_url = NEXT_PAGE % pages[-1]
        yield response.follow(next_url, self.parse_catalog)
def parse_collection(self, response, manga=None):
    """Fill the manga details and request its first chapter page.

    @url https://mangadex.org/manga/39/one-piece
    @returns items 0
    @returns request 1
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # Most fields live in a table: the label is a <th>, the value is
    # the text found under the sibling <td>.
    row_xp = '//th[contains(text(),"%s")]' \
        '/following-sibling::td/descendant-or-self::*/text()'

    def field(label):
        # Selection of the text nodes stored next to `label`.
        return response.xpath(row_xp % label)

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath('//h3[@class="panel-title"]/text()').extract()
    # Alternate name
    manga['alt_name'] = field('Alt name(s):').extract()
    # Author
    manga['author'] = field('Author:').re(r'([^,]+),?')
    # Artist
    manga['artist'] = field('Artist:').re(r'([^,]+),?')
    # Reading direction
    direction_xp = '//h3[@class="panel-title"]/img/@alt'
    manga['reading_direction'] = response.xpath(direction_xp).extract_first()
    # Status
    manga['status'] = field('Pub. status:').extract_first()
    # Genres (demographic entries first)
    demographic = field('Demographic:').extract()
    genres = field('Genres:').extract()
    manga['genres'] = demographic + genres
    # Rank
    rating = field('Rating:').extract_first()
    manga['rank'] = 100 * convert_to_number(rating)
    # Rank order
    manga['rank_order'] = 'DESC'
    # Description
    manga['description'] = field('Description:').extract()
    # Cover image
    cover = response.xpath('//img[@class="border-radius"]/@src').extract_first()
    manga['image_urls'] = [response.urljoin(cover)]

    # Information needed to deduce the issue order: the announced
    # chapter total, or the number of chapter rows when it is missing.
    total_xp = '//p[@class="text-center"]/text()'
    chapters = response.xpath(total_xp).re_first(r'of (.*) chapters')
    if chapters:
        chapters = convert_to_number(chapters, as_int=True)
    else:
        chapters = len(response.xpath('//tr[contains(@id,"chapter_")]'))

    # If the manga is empty (is frequent in MangaDex), end the
    # processing
    if not chapters:
        return

    # Parse the manga issues list, starting at the first chapter page.
    manga['issues'] = []
    first_page = response.url + '/chapters/1'
    return response.follow(first_page, self._parse_issues,
                           meta={'manga': manga, 'chapters': chapters})
def parse_collection(self, response, manga=None):
    """Fill the manga details and its issue list from the manga page.

    @url http://kissmanga.com/Manga/Naruto
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # Labelled fields: values are either sibling anchors or the first
    # sibling text node of the label span.
    links_xp = '//span[@class="info" and contains(text(), "%s")]' \
        '/following-sibling::a/text()'
    text_xp = '//span[@class="info" and contains(text(), "%s")]' \
        '/following-sibling::text()[1]'

    # URL
    manga['url'] = response.url
    # Name
    name_xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
    manga['name'] = response.xpath(name_xp).extract()
    # Alternate name
    manga['alt_name'] = response.xpath(links_xp % 'Other name:').extract()
    # Author
    manga['author'] = response.xpath(links_xp % 'Author:').extract()
    # Artist (the author value is reused)
    manga['artist'] = manga['author']
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Genres
    manga['genres'] = response.xpath(links_xp % 'Genres:').extract()
    # Status
    manga['status'] = response.xpath(text_xp % 'Status:').extract()
    # Rank
    manga['rank'] = response.xpath(text_xp % 'Views:').re(r'(\d+).')
    # Rank order
    manga['rank_order'] = 'DESC'
    # Description
    summary_xp = '//p[span[@class="info" and contains(text(), "%s")]]'\
        '/following-sibling::p[1]/text()'
    manga['description'] = response.xpath(summary_xp % 'Summary:').extract()
    # Cover image
    cover = response.xpath('//div[@id="rightside"]//img/@src').extract_first()
    manga['image_urls'] = [response.urljoin(cover)]

    # Parse the manga issues list
    manga['issues'] = []
    rows = response.xpath('//table[@class="listing"]/tr[td]')
    total = len(rows)
    for row in rows:
        issue = Issue(language='EN')
        titles = row.xpath('.//a/text()')
        # Name
        issue['name'] = titles.extract()
        # Number
        # Some examples that this regex needs to address
        #   1/11 Vol.003 Ch.009.006: Omake 004-koma
        #   21st Century Boys 014
        #   Mob Psycho 100 Ch.099.001: Mob
        #   Mob Psycho 100 Ch.098.002
        #   Fantastic World Vol.001 Ch.002
        #   Black Clover 118 - Mage X
        #   Black Clover 099: Family
        issue['number'] = titles.re(
            r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)(\d[.\d]+)'
            r'|(\d[.\d]+)[ :-]+'
            r'|(\d[.\d]+)$')
        # Order: counts down from the number of rows as issues are added.
        issue['order'] = total - len(manga['issues'])
        # Release
        issue['release'] = row.xpath('./td[2]/text()').re(
            r'\d{1,2}/\d{1,2}/\d{4}')
        # URL
        href = row.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(href)
        manga['issues'].append(issue)
    return manga
def parse_collection(self, response, manga=None):
    """Fill the manga details and its issue list from the manga page.

    @url http://www.mangahere.cc/manga/angel_densetsu/
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres rank rank_order description image_urls
    @scrapes issues
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # MangaHere returns 200 for 404 pages
    if response.xpath('//div[@class="error_404"]').extract():
        return

    # Check if manga is licensed
    licensed_xp = '//div[@class="detail_list"]/div[@class="mt10 color_ff00 mb10"]'
    if response.xpath(licensed_xp).extract():
        return

    # Labelled fields: values are either the text of the <li> itself or
    # the text of the anchors inside it.
    text_xp = '//li[label[contains(text(),"%s")]]/text()'
    links_xp = '//li[label[contains(text(),"%s")]]/a/text()'

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath(
        '//meta[@property="og:title"]/@content').extract()
    # Alternate name
    manga['alt_name'] = response.xpath(
        text_xp % 'Alternative Name:').re(r'([^;]+)')
    # Author
    manga['author'] = response.xpath(links_xp % 'Author(s):').extract()
    # Artist
    manga['artist'] = response.xpath(links_xp % 'Artist(s):').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    manga['status'] = response.xpath(text_xp % 'Status:').extract_first()
    # Genres
    manga['genres'] = response.xpath(text_xp % 'Genre(s):').re(r'([^,]+)')
    # Rank
    manga['rank'] = response.xpath(text_xp % 'Rank:').extract()
    # Rank order
    manga['rank_order'] = 'ASC'
    # Description
    summary_xp = '//li[label[contains(text(),"%s")]]/p[@id="show"]/text()'
    manga['description'] = response.xpath(summary_xp % 'Summary:').extract()
    # Cover image
    manga['image_urls'] = response.xpath('//img[@class="img"]/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    rows = response.xpath('//div[@class="detail_list"]/ul[not(@class)]/li')
    total = len(rows)

    # A single row holding the "No Manga Chapter" notice means there
    # are no issues at all.
    if total == 1 and 'No Manga Chapter' in rows[0].extract():
        return

    for row in rows:
        issue = Issue(language='EN')
        # Name: link text plus the two optional title spans.
        link_text = row.xpath('.//a/text()').extract()
        extra_title = row.xpath('.//span[@class="mr6"]/text()').extract()
        left_title = row.xpath('.//span[@class="left"]/text()').extract()
        issue['name'] = link_text + extra_title + left_title
        # Number: trailing digits/dots of the link text.
        issue['number'] = row.xpath('.//a/text()').re(
            r'([.\d]+)\s*$')
        # Order: counts down from the number of rows as issues are added.
        issue['order'] = total - len(manga['issues'])
        # Release
        issue['release'] = row.xpath('.//span[@class="right"]/text()').extract()
        # URL
        href = row.xpath('.//a/@href').extract_first()
        issue['url'] = response.urljoin(href)
        manga['issues'].append(issue)
    return manga
def parse_latest(self, response, until=None):
    """Yield the updated mangas, with their new issues, until a date.

    @url http://bato.to
    @returns items 1 100
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    # Rows are grouped by the first token of their class attribute: a
    # row that starts a new group describes a manga, the rows that
    # repeat the group describe its issues.
    previous_group, manga = None, None
    for row in response.xpath('.//tr[contains(@class, "row")]'):
        group = row.xpath('@class').extract()[0].split()[0]
        if group == previous_group:
            issue = Issue()
            # Name
            title = row.xpath(
                './/td/a[img/@style="vertical-align:middle;"]/text()')
            issue['name'] = title.extract()
            # Number
            issue['number'] = title.re(
                r'Ch.(?:Story )?([.\d]+)')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Language
            issue['language'] = row.xpath('.//td/div/@title').extract()
            # Release
            issue['release'] = row.xpath('.//td[last()]/text()').extract()
            # URL
            href = row.xpath(
                './/td/a[img/@style="vertical-align:middle;"]/@href'
            ).extract()[0]
            issue['url'] = response.urljoin(href)

            # An issue older than `until` ends the whole parse; the
            # manga being collected is not yielded in that case.
            released = convert_to_date(issue['release'][0].strip())
            if released < until:
                return
            manga['issues'].append(issue)
        else:
            # A new group starts: flush the manga collected so far.
            if manga:
                yield manga
            manga = Manga(issues=[])
            # Name
            manga['name'] = row.xpath('td[2]/a[2]/text()').extract()
            # URL
            manga['url'] = row.xpath('td[2]/a[2]/@href').extract()
        previous_group = group

    # Return the last manga
    if manga:
        yield manga

    # Next page, carrying `until` along in the request meta.
    older = response.xpath('//a[@title="Older Releases"]/@href').extract()
    if older:
        yield scrapy.Request(response.urljoin(older[0]), self.parse_latest,
                             meta={'until': until})
def parse_collection(self, response, manga=None):
    """Fill the manga details and its issue list, then yield the item.

    @url http://bato.to/comic/_/comics/angel-densetsu-r460
    @returns items 1 1
    @returns request 0 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description issues
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # Labelled fields: the label is a <td>, the value sits in the
    # sibling <td> (as child element text, plain text or genre spans).
    child_xp = '//td[contains(text(),"%s")]/following-sibling::td/*/text()'
    text_xp = '//td[contains(text(),"%s")]/following-sibling::td/text()'
    genre_xp = '//td[contains(text(),"%s")]/following-sibling::td/a/span/text()'

    # URL
    # Batoto can have different URLs for comics (_/comics/, _/sp/),
    # so here the manga is updated with the real one.
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath(
        '//h1[@class="ipsType_pagetitle"]/text()').extract()
    # Alternate name
    manga['alt_name'] = response.xpath(child_xp % 'Alt Names:').re(r'([^,;]+)')
    # Author
    manga['author'] = response.xpath(child_xp % 'Author:').extract()
    # Artist
    manga['artist'] = response.xpath(child_xp % 'Artist:').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status
    manga['status'] = response.xpath(text_xp % 'Status:').extract()
    # Genres
    manga['genres'] = response.xpath(genre_xp % 'Genres:').extract()
    # Description
    manga['description'] = response.xpath(text_xp % 'Description:').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//div[@class="ipsBox"]/div/div/img/@src').extract()

    # Parse the manga issues list (expanded chapter rows are skipped).
    manga['issues'] = []
    rows_xp = '//tr[contains(@class,"chapter_row")' \
        ' and not(contains(@class,"chapter_row_expand"))]'
    rows = response.xpath(rows_xp)
    total = len(rows)
    for row in rows:
        issue = Issue()
        title = row.xpath('./td[1]/a/text()')
        # Name
        issue['name'] = title.extract()
        # Number
        issue['number'] = title.re(r'Ch.(?:Story )?([.\d]+)')
        # Order: counts down from the number of rows as issues are added.
        issue['order'] = total - len(manga['issues'])
        # Language
        issue['language'] = row.xpath('./td[2]/div/@title').extract()
        # Release
        issue['release'] = row.xpath('./td[5]/text()').extract()
        # URL
        href = row.xpath('./td[1]/a/@href').extract()[0]
        issue['url'] = response.urljoin(href)
        manga['issues'].append(issue)
    yield manga
def parse_latest(self, response, until=None):
    """Yield the updated mangas, with their new issues, until a date.

    @url http://www.mangahere.co/latest/
    @returns items 1 100
    @returns request 0 1
    @scrapes url name issues
    """
    if not until:
        until = (response.meta['until'] if 'until' in response.meta
                 else date.today())

    for block in response.xpath('//div[@class="manga_updates"]/dl'):
        # An update older than `until` ends the whole parse.
        stamps = block.xpath('.//span[@class="time"]/text()').extract()
        released = convert_to_date(stamps[0])
        if released < until:
            return

        manga = Manga()
        # Name
        manga['name'] = block.xpath('.//a[@class="manga_info"]/text()').extract()
        # URL
        hrefs = block.xpath('.//a[@class="manga_info"]/@href').extract()
        manga['url'] = response.urljoin(hrefs[0])

        # Parse the manga issues list: one <dd> per issue.
        manga['issues'] = []
        for entry in block.xpath('./dd'):
            issue = Issue(language='EN')
            title = entry.xpath('a/text()')
            # Name
            issue['name'] = title.extract()
            # Number
            issue['number'] = title.re(r'([.\d]+)\s*$')
            # Order
            # This is only an estimation for now
            issue['order'] = issue['number']
            # Release: the date of the whole update block.
            issue['release'] = released
            # URL
            links = entry.xpath('a/@href').extract()
            issue['url'] = response.urljoin(links[0])
            manga['issues'].append(issue)
        yield manga

    # Next page, carrying `until` along in the request meta.
    following = response.xpath('//a[@class="next"]/@href').extract()
    if following:
        yield scrapy.Request(response.urljoin(following[0]),
                             self.parse_latest, meta={'until': until})
def parse_collection(self, response, manga=None):
    """Fill the manga details and its issue list from the manga page.

    @url http://unionmangas.net/manga/bleach
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # Text node that follows a given <label>.
    label_xp = '//label[contains(text(), "%s")]/following-sibling::text()'

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath(
        '//div[@class="col-md-12"]/h2/text()').extract_first()
    # Alternate name (the name is reused)
    manga['alt_name'] = manga['name']
    # Author
    manga['author'] = response.xpath(label_xp % 'Autor:').extract()
    # Artist
    manga['artist'] = response.xpath(label_xp % 'Artista:').extract()
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status (Ativo / Completo)
    status_xp = '//label[contains(text(), "Status:")]' \
        '/following-sibling::span/text()'
    manga['status'] = response.xpath(status_xp).extract()
    # Genres
    genres_xp = u'//label[contains(text(), "Gênero(s):")]' \
        u'/following-sibling::a/text()'
    manga['genres'] = response.xpath(genres_xp).extract()
    # Description
    manga['description'] = response.xpath(
        '//div[@class="panel-body"]/text()').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//img[@class="img-thumbnail"]/@src').extract()

    # Parse the manga issues list
    manga['issues'] = []
    rows = response.xpath('//div[@class="col-xs-6 col-md-6"]')
    total = len(rows)
    for row in rows:
        issue = Issue(language='PT')
        title = row.xpath('a/text()')
        # Name
        issue['name'] = title.extract()
        # Number
        issue['number'] = title.re(r'Cap. ([.\d]+)$')
        # Order: counts down from the number of rows as issues are added.
        issue['order'] = total - len(manga['issues'])
        # Release
        issue['release'] = row.xpath('span/text()').re(r'\d{2}/\d{2}/\d{4}')
        # URL
        issue['url'] = row.xpath('a/@href').extract()
        manga['issues'].append(issue)
    return manga
def parse_collection(self, response, manga=None):
    """Fill the manga details and its issue list from the manga page.

    @url http://submanga.org/bloody-cross
    @returns items 1
    @returns request 0
    @scrapes url name alt_name author artist reading_direction
    @scrapes status genres description image_urls issues
    """
    manga = (response.meta['manga'] if 'manga' in response.meta
             else Manga(url=response.url))

    # Text node that follows a given label <span>.
    label_xp = '//span[contains(text(), "%s")]/following-sibling::text()'

    # URL
    manga['url'] = response.url
    # Name
    manga['name'] = response.xpath(
        '//a[@class="btn-link text-semibold text-mint"][2]/text()').extract()
    # Alternate name
    alt_label = u'Títulos alternativos:'
    manga['alt_name'] = response.xpath(label_xp % alt_label).re(r'([^,;]+)')
    # Author
    manga['author'] = response.xpath(label_xp % 'Autor:').extract()
    # Artist (the author value is reused)
    manga['artist'] = manga['author']
    # Reading direction
    manga['reading_direction'] = 'RL'
    # Status (Finalizado / En curso)
    manga['status'] = response.xpath(
        '//span[@class="text-2x text-thin"]/text()').extract()
    # Genres
    genres_xp = '//span[contains(text(), "Generos:")]/following-sibling::a/text()'
    manga['genres'] = response.xpath(genres_xp).extract()
    # Description
    manga['description'] = response.xpath(
        '//p[@class="text-justify"]/text()').extract()
    # Cover image
    manga['image_urls'] = response.xpath(
        '//img[@class="img-cover-m"]/@src').extract()

    # Parse the manga issues list: one issue per row and per language
    # flag found in the third cell of the row.
    manga['issues'] = []
    for row in response.xpath('//table[@id="caps-list"]//tr'):
        for lang in row.xpath('td[3]/a/img/@alt').extract():
            issue = Issue(language=lang)
            # Name
            issue['name'] = row.xpath('td[2]/a/text()').extract()
            # Number
            issue['number'] = row.xpath('td[1]/a/text()').extract_first()
            # Order
            issue['order'] = int(issue['number'])
            # Release
            issue['release'] = row.xpath('td[4]/a/span/text()').extract()
            # URL: the row link suffixed with the language.
            href = row.xpath('td[1]/a/@href').extract_first()
            issue['url'] = '%s/%s' % (href, lang)
            manga['issues'].append(issue)
    return manga