def getLyrics(track, artist):
    """
    Fetch the Pandora song page for ``artist``/``track`` and resolve its lyrics.

    Both names are UTF-8 encoded, lower-cased and URL-quoted to build the
    song page address.  When the page lacks the track UID or the lyric
    id/checksum pair an empty dictionary is returned; otherwise whatever
    ``__getEncryptedLyrics`` produces is returned unchanged.
    """
    encodedTrack = track.encode('utf-8')
    encodedArtist = artist.encode('utf-8')
    artistPart = urllib.quote_plus(encodedArtist.lower())
    trackPart = urllib.quote_plus(encodedTrack.lower())
    page = getSourceCode(
        'http://www.pandora.com/music/song/%s/%s' % (artistPart, trackPart))

    try:
        trackUid = regex_trackUid.search(page).group(1)
        lyricId, checkSum = regex_lyricIdCheckSum.search(page).group(1, 2)
    except AttributeError:
        # A failed search yields None, which has no .group attribute.
        return {}

    # The explicit-filter flag and the auth token are always sent as these
    # fixed strings.
    return __getEncryptedLyrics(trackUid, lyricId, checkSum, 'false', 'null')
def downloadChapter(self, downloadThread, max_pages, url, manga_chapter_prefix, current_chapter):
    """
    Download every page image of one MangaPanda chapter.

    The chapter page at ``url`` is fetched through ``self.proxy`` and scanned
    with ``MangaPanda.re_getPage``.  Each match supplies a site-relative page
    path (first element) and a page number (second element).  ``max_pages``
    is only used in the progress line printed in verbose mode.
    """
    chapterSource = getSourceCode(url, self.proxy)
    for match in MangaPanda.re_getPage.findall(chapterSource):
        pageNumber = match[1]
        if self.verbose_FLAG:
            chapterName = self.chapters[current_chapter][1]
            print(chapterName + ' | ' + 'Page %s / %i' % (pageNumber, max_pages))
        self.downloadImage(downloadThread, pageNumber,
                           'http://www.mangapanda.com' + match[0],
                           manga_chapter_prefix)
def downloadChapter(self, downloadThread, max_pages, url, manga_chapter_prefix, current_chapter):
    """
    Download every page image of one MangaReader chapter.

    ``MangaReader.re_getPage`` is applied to the chapter page fetched from
    ``url``; every match carries a site-relative page path followed by the
    page number.  ``max_pages`` only appears in the verbose progress line.
    """
    matches = MangaReader.re_getPage.findall(getSourceCode(url, self.proxy))
    for found in matches:
        number = found[1]
        if self.verbose_FLAG:
            title = self.chapters[current_chapter][1]
            print(title + ' | ' + 'Page %s / %i' % (number, max_pages))
        address = 'http://www.mangareader.net' + found[0]
        self.downloadImage(downloadThread, number, address, manga_chapter_prefix)
def parseSite(self):
    """
    Resolve the manga on MangaPanda and build its chapter list.

    The alphabetical index is searched for the series, the series page is
    scanned for chapters, and ``self.chapters`` is rewritten into
    ``(absolute url, display title, first captured name)`` tuples.  In auto
    mode every chapter after ``self.lastDownloaded`` is queued (raising
    ``self.NoUpdates`` when there is none); otherwise the list is printed
    and ``self.selectChapters`` decides.
    """
    print('Beginning MangaPanda check: %s' % self.manga)

    listing = getSourceCode('http://www.mangapanda.com/alphabetical', self.proxy)
    seriesStart = listing.find('series_col')
    allSeries = MangaPanda.re_getSeries.findall(listing[seriesStart:])
    keyword = self.selectFromResults(allSeries)

    seriesPage = getSourceCode('http://www.mangapanda.com%s' % keyword, self.proxy)
    self.chapters = MangaPanda.re_getChapters.findall(seriesPage)

    firstNew = 0
    for position, found in enumerate(self.chapters):
        link = 'http://www.mangapanda.com%s' % found[0]
        title = '%s%s' % (found[1], found[2])
        self.chapters[position] = (link, title, found[1])
        if self.auto:
            if self.lastDownloaded == title:
                firstNew = position + 1
        else:
            print('(%i) %s' % (position + 1, title))

    # range() excludes its upper bound, so the plain length is the right limit.
    total = len(self.chapters)

    self.isPrependMangaName = False
    if self.auto:
        if firstNew == total:
            raise self.NoUpdates
        for index in range(firstNew, total):
            self.chapters_to_download.append(index)
    else:
        self.chapters_to_download = self.selectChapters(self.chapters)
    self.isPrependMangaName = True
def downloadAnimea(self, manga, chapter_start, chapter_end, download_path, download_format):
    """
    Download chapters ``chapter_start`` to ``chapter_end`` (inclusive) of
    ``manga`` from manga.animea.net.

    For each chapter the first page is fetched to read the page count from
    the ``<title>``, every page image is saved under ``mangadl_tmp`` and the
    chapter is then handed to ``compress``.  A chapter whose ``.cbz`` or
    ``.zip`` already exists in ``download_path`` is skipped unless the
    module-level ``overwrite_FLAG`` allows overwriting.

    Nothing is downloaded when ``chapter_start`` is greater than
    ``chapter_end``.
    """
    # Compiled once instead of once per page.
    re_maxPages = re.compile('of (.*?)</title>')
    # The dot before the extension is escaped: unescaped it matches any
    # character, so a URL such as ".../ajpg.jpg" was cut short at "ajpg".
    re_imageUrl = re.compile(r'img src="(http.*?\.[jp][pn]g)"')
    pageUrl = 'http://manga.animea.net/%s-chapter-%s-page-%s.html'

    for current_chapter in range(chapter_start, chapter_end + 1):
        manga_chapter_prefix = manga.lower().replace('-', '_') + '_' + str(current_chapter).zfill(3)

        archiveBase = download_path + manga_chapter_prefix
        alreadyDownloaded = (os.path.exists(archiveBase + '.cbz')
                             or os.path.exists(archiveBase + '.zip'))
        if alreadyDownloaded and overwrite_FLAG == False:
            print('Chapter ' + str(current_chapter) + ' already downloaded, skipping to next chapter...')
            continue

        source = getSourceCode(pageUrl % (manga, current_chapter, 1))
        max_pages = int(re_maxPages.search(source).group(1))

        for page in range(1, max_pages + 1):
            # Page 1 was already fetched above to read the page count.
            if page > 1:
                source = getSourceCode(pageUrl % (manga, current_chapter, page))
            img_url = re_imageUrl.search(source).group(1)
            print('Chapter ' + str(current_chapter) + ' / ' + 'Page ' + str(page))
            print(img_url)
            downloadImage(img_url, os.path.join('mangadl_tmp', manga_chapter_prefix + '_' + str(page).zfill(3)))

        compress(manga_chapter_prefix, download_path, max_pages, download_format)
def downloadChapter(self, downloadThread, max_pages, url, manga_chapter_prefix, current_chapter):
    """
    Download every page of one Batoto chapter.

    ``max_pages`` is ignored: the page addresses are read from the
    ``<option>`` entries of the ``page_select`` drop-down on the chapter page
    instead of being counted by regex.  Pages are numbered from 1 in the
    order the options appear.
    """
    chapterPage = BeautifulSoup(getSourceCode(url, self.proxy))
    pageSelect = chapterPage.find("select", id="page_select")
    for pageNumber, option in enumerate(pageSelect("option"), 1):
        if self.verbose_FLAG:
            print(option['value'])
        self.downloadImage(downloadThread, pageNumber, option['value'], manga_chapter_prefix)
def parseSite(self):
    """
    Resolve the manga on OtakuWorks and build its chapter list.

    The site search is queried with the manga name.  When it lists series,
    ``self.selectFromResults`` picks one and its page is fetched; otherwise
    the search response itself is used as the series page.  Chapter links
    are extracted, reversed and rewritten into
    ``(absolute read url, link text, text after '#')`` tuples.

    Raises ``self.MangaNotFound`` for a licensed/removed series and
    ``self.NoUpdates`` when auto mode finds nothing after
    ``self.lastDownloaded``.
    """
    print('Beginning OtakuWorks check: %s' % self.manga)

    searchUrl = 'http://www.otakuworks.com/search/%s' % '+'.join(self.manga.split())
    source = getSourceCode(searchUrl)

    # No matches means either zero search results or that we have already
    # been redirected to the manga homepage.
    matches = OtakuWorks.re_getMangas.findall(source)
    if matches:
        source = getSourceCode(self.selectFromResults(matches))

    if 'has been licensed and as per request all releases under it have been removed.' in source:
        raise self.MangaNotFound('It has been removed.')

    # The pattern embeds the manga name, so it cannot be compiled ahead of time.
    namePart = '-'.join(fixFormatting(self.manga, '.').replace('_', ' ').split())
    chapterPattern = re.compile('a href="([^>]*%s[^>]*)">([^<]*#([^<]*))</a>' % namePart)
    self.chapters = chapterPattern.findall(source)
    self.chapters.reverse()

    firstNew = 0
    for position, found in enumerate(self.chapters):
        entry = ('http://www.otakuworks.com' + found[0] + '/read', found[1], found[2])
        self.chapters[position] = entry
        if self.auto:
            if self.lastDownloaded == entry[1]:
                firstNew = position + 1
        else:
            print('(%i) %s' % (position + 1, entry[1]))

    # range() excludes its upper bound, so the plain length is the right limit.
    total = len(self.chapters)

    if self.auto:
        if firstNew == total:
            raise self.NoUpdates
        for index in range(firstNew, total):
            self.chapters_to_download.append(index)
    else:
        self.chapters_to_download = self.selectChapters(self.chapters)
def parseSite(self):
    """
    Look the manga up in MangaPanda's alphabetical index and list its chapters.

    ``self.chapters`` ends up holding ``(absolute url, display title, first
    captured name)`` tuples.  Interactive mode prints the titles and defers
    to ``self.selectChapters``; auto mode queues the index of every chapter
    after ``self.lastDownloaded`` or raises ``self.NoUpdates``.
    """
    siteRoot = 'http://www.mangapanda.com%s'
    print('Beginning MangaPanda check: %s' % self.manga)

    index = getSourceCode('http://www.mangapanda.com/alphabetical', self.proxy)
    candidates = MangaPanda.re_getSeries.findall(index[index.find('series_col'):])
    keyword = self.selectFromResults(candidates)

    self.chapters = MangaPanda.re_getChapters.findall(
        getSourceCode(siteRoot % keyword, self.proxy))

    resumeAt = 0
    for slot, raw in enumerate(self.chapters):
        address = siteRoot % raw[0]
        label = '%s%s' % (raw[1], raw[2])
        self.chapters[slot] = (address, label, raw[1])
        if not self.auto:
            print('(%i) %s' % (slot + 1, label))
        elif self.lastDownloaded == label:
            resumeAt = slot + 1

    # The length is already the exclusive upper bound range() expects.
    chapterCount = len(self.chapters)

    self.isPrependMangaName = False
    if not self.auto:
        self.chapters_to_download = self.selectChapters(self.chapters)
    else:
        if resumeAt == chapterCount:
            raise self.NoUpdates
        for pending in range(resumeAt, chapterCount):
            self.chapters_to_download.append(pending)
    self.isPrependMangaName = True
def downloadChapter(self, downloadThread, max_pages, url, manga_chapter_prefix, current_chapter):
    """
    Download every page image of one EatManga chapter.

    Matches of ``EatManga.re_getPage`` on the chapter page are de-duplicated
    while keeping their first-seen order.  Each match supplies a
    site-relative page path and a page number.  ``max_pages`` only appears
    in the verbose progress line.
    """
    chapterSource = getSourceCode(url, self.proxy)
    # OrderedDict keys drop repeated matches without reordering them.
    uniquePages = list(OrderedDict.fromkeys(EatManga.re_getPage.findall(chapterSource)))

    for match in uniquePages:
        pageNumber = match[1]
        if self.verbose_FLAG:
            chapterName = self.chapters[current_chapter][1]
            print(chapterName + ' | ' + 'Page %s / %i' % (pageNumber, max_pages))
        self.downloadImage(downloadThread, pageNumber,
                           'http://eatmanga.com%s' % match[0],
                           manga_chapter_prefix)
def parseSite(self):
    """
    Build the chapter list for the requested manga from EatManga.

    The series page is fetched from the formatted manga name, its chapter
    links are extracted and reversed, chapters whose link contains
    ``'upcoming'`` are dropped, and the rest are rewritten into
    ``(absolute url, name, name)`` tuples.  In auto mode every chapter after
    ``self.lastDownloaded`` is queued; otherwise the list is printed and
    ``self.selectChapters`` decides.

    Raises:
        self.MangaNotFound: the series page lists no chapters at all.
        self.NoUpdates: auto mode found nothing after ``self.lastDownloaded``.
    """
    print('Beginning EatManga check: %s' % self.manga)

    url = 'http://eatmanga.com/Manga-Scan/%s' % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)

    source = getSourceCode(url, self.proxy)
    self.chapters = EatManga.re_getChapters.findall(source)
    self.chapters.reverse()
    if not self.chapters:
        raise self.MangaNotFound

    # Drop unavailable chapters before indexing.  Deleting inside a loop over
    # range(len(...)) skipped the entry following each deletion and then ran
    # past the end of the shortened list.
    self.chapters = [chapter for chapter in self.chapters
                     if 'upcoming' not in chapter[0]]

    lowerRange = 0
    for i, chapter in enumerate(self.chapters):
        self.chapters[i] = ('http://eatmanga.com%s' % chapter[0], chapter[2], chapter[2])
        if not self.auto:
            print('(%i) %s' % (i + 1, self.chapters[i][1]))
        elif self.lastDownloaded == self.chapters[i][1]:
            lowerRange = i + 1

    # range() excludes its upper bound, so the plain length is the right limit.
    upperRange = len(self.chapters)

    if not self.auto:
        self.chapters_to_download = self.selectChapters(self.chapters)
    else:
        if lowerRange == upperRange:
            raise self.NoUpdates
        for i in range(lowerRange, upperRange):
            self.chapters_to_download.append(i)

    self.isPrependMangaName = True
def __getEncryptedLyrics(trackUid, lyricId, checkSum, nonExplicit, authToken):
    """
    Request a track's encrypted lyrics from Pandora's ajax service and
    return the result of decrypting them with ``__decryptLyrics``.

    ``lyricId`` is accepted but is not part of the request that is sent.
    """
    query = urllib.urlencode({
        'method': 'lyrics.getLyrics',
        'trackUid': trackUid,
        'checkSum': checkSum,
        'nonExplicit': nonExplicit,
        'authToken': authToken,
    })
    response = getSourceCode("http://www.pandora.com/services/ajax/?" + query)

    # The decryption key is embedded in the response as: var k="..."
    decryptionKey = re.search('var k="([^"]*)"', response).group(1)

    # Embedded javascript functions may contain double quotes, which break
    # literal parsing, so each one is replaced with 0.  \u00XX escapes are
    # rewritten as \xXX.
    withoutFunctions = re.sub('(function[^,]*)', '0', response)
    literalText = withoutFunctions.replace('\\u00', r'\x')

    # ast.literal_eval is used rather than eval because it is safer.
    payload = ast.literal_eval(literalText)
    return __decryptLyrics(payload['lyrics'], decryptionKey)
def __getEncryptedLyrics(trackUid, lyricId, checkSum, nonExplicit, authToken):
    """
    Call Pandora's ``lyrics.getLyrics`` ajax method and decrypt the answer.

    The response carries both the decryption key (``var k="..."``) and a
    dictionary-like body whose ``lyrics`` entry is passed, together with the
    key, to ``__decryptLyrics``.  ``lyricId`` is not sent with the request.
    """
    parameters = {
        'method': 'lyrics.getLyrics',
        'trackUid': trackUid,
        'checkSum': checkSum,
        'nonExplicit': nonExplicit,
        'authToken': authToken,
    }
    endpoint = "http://www.pandora.com/services/ajax/?"
    body = getSourceCode(endpoint + urllib.urlencode(parameters))

    key = re.search('var k="([^"]*)"', body).group(1)

    # Javascript functions can contain double quotes that make the literal
    # parser fail, so they are blanked to 0 before parsing.
    body = re.sub('(function[^,]*)', '0', body)
    body = body.replace('\\u00', r'\x')

    # literal_eval instead of eval because it is safer.
    lyrics = ast.literal_eval(body)['lyrics']
    return __decryptLyrics(lyrics, key)
def getLyrics(track, artist):
    """
    Resolve the lyrics of ``track`` by ``artist`` through Pandora.

    Returns an empty dictionary when the song page does not contain the
    track UID or the lyric id/checksum pair; otherwise returns the result of
    ``__getEncryptedLyrics`` for the values found on the page.
    """
    track = track.encode('utf-8')
    artist = artist.encode('utf-8')
    quoted = (urllib.quote_plus(artist.lower()), urllib.quote_plus(track.lower()))
    songPage = getSourceCode('http://www.pandora.com/music/song/%s/%s' % quoted)

    try:
        uid = regex_trackUid.search(songPage).group(1)
        idAndSum = regex_lyricIdCheckSum.search(songPage)
        lyricId = idAndSum.group(1)
        checkSum = idAndSum.group(2)
    except AttributeError:
        # search() returned None: the page has no lyrics information.
        return {}

    nonExplicit = 'false'
    authToken = 'null'
    return __getEncryptedLyrics(uid, lyricId, checkSum, nonExplicit, authToken)
def get_next_url(self, c):
    """
    Return the ``href`` of the element wrapping the image titled
    "Next Chapter" on the page at ``c``.
    """
    page = BeautifulSoup(getSourceCode(c, self.proxy))
    nextImage = page.find("img", title="Next Chapter")
    return nextImage.parent['href']
def parseSite(self):
    """
    Parses list of chapters and URLs associated with each one for the given
    manga and site.

    The series page is first guessed from the manga name.  If that page is
    missing, the site search is queried (at most twice) and
    ``self.selectFromResults`` picks the series.  Chapter links are then
    extracted (volume + chapter ids, falling back to chapter-only ids),
    reversed and rewritten into ``(url, name, chapter id)`` tuples.  The
    last chapter is dropped when its page says it is not available yet.

    Raises:
        self.MangaNotFound: the series cannot be resolved, its page cannot
            be fetched, or the site reports it as removed.
        self.NoUpdates: auto mode found nothing after ``self.lastDownloaded``.
    """
    print('Beginning MangaHere check: %s' % self.manga)

    # jump straight to expected URL and test if manga removed
    url = 'http://www.mangahere.com/manga/%s/' % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)
    source = getSourceCode(url, self.proxy)

    if source is None or 'the page you have requested can' in source:
        url = 'http://www.mangahere.com/search.php?name=%s' % '+'.join(self.manga.split())
        if self.verbose_FLAG:
            print(url)
        try:
            source = getSourceCode(url, self.proxy)
            # findall is only run on a page that was actually fetched; the
            # unguarded call that used to precede this crashed on None.
            seriesResults = []
            if source is not None:
                seriesResults = MangaHere.re_getSeries.findall(source)

            if not seriesResults:
                # Same query once more before giving up.
                if self.verbose_FLAG:
                    print(url)
                source = getSourceCode(url, self.proxy)
                if source is not None:
                    seriesResults = MangaHere.re_getSeries.findall(source)
        # 0 results
        except AttributeError:
            raise self.MangaNotFound('It doesn\'t exist, or cannot be resolved by autocorrect.')
        else:
            keyword = self.selectFromResults(seriesResults)
            if self.verbose_FLAG:
                print("Keyword: %s" % keyword)
            url = 'http://www.mangahere.com/manga/%s/' % keyword
            if self.verbose_FLAG:
                print(url)
            source = getSourceCode(url, self.proxy)
            if source is None:
                # Same handling as the MangaFox parser: without a series
                # page there is nothing to parse.
                raise self.MangaNotFound('Search Failed to find Manga.')
    else:
        # The Guess worked
        keyword = self.fixFormatting(self.manga)
        if self.verbose_FLAG:
            print("Keyword: %s" % keyword)

    # other check for manga removal if our initial guess for the name was wrong
    if 'it is not available in.' in source:
        raise self.MangaNotFound('It has been removed.')

    # can't pre-compile this because relies on class name
    isChapterOnly = False
    re_getChapters = re.compile(
        r'a.*?href="http://.*?mangahere.*?/manga/%s/(v[\d]+)/(c[\d]+)/[^"]*?"' % keyword)
    self.chapters = re_getChapters.findall(source)
    if not self.chapters:
        if self.verbose_FLAG:
            print("Trying chapter only regex")
        isChapterOnly = True
        re_getChapters = re.compile(
            r'a.*?href="http://.*?mangahere.*?/manga/%s/(c[\d]+)/[^"]*?"' % keyword)
        self.chapters = re_getChapters.findall(source)

    self.chapters.reverse()

    # Fix URLs from relative to absolute and locate the last downloaded
    # chapter for auto mode.
    lowerRange = 0
    if isChapterOnly:
        # Each match is a single chapter id string.
        for i, chapter in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s" % chapter)
            if self.auto and self.lastDownloaded == chapter:
                lowerRange = i + 1
            self.chapters[i] = (
                'http://www.mangahere.com/manga/%s/%s' % (keyword, chapter),
                chapter,
                chapter)
    else:
        # Each match is a (volume id, chapter id) pair.
        for i, chapter in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s %s" % (chapter[0], chapter[1]))
            self.chapters[i] = (
                'http://www.mangahere.com/manga/%s/%s/%s' % (keyword, chapter[0], chapter[1]),
                chapter[0] + "." + chapter[1],
                chapter[1])
            if self.auto and self.lastDownloaded == chapter[1]:
                lowerRange = i + 1

    upperRange = len(self.chapters)

    # Validate whether the last chapter is actually available.  Skipped when
    # no chapter was found, which used to raise IndexError.
    if upperRange:
        lastChapter = self.chapters[upperRange - 1]
        if self.verbose_FLAG:
            print(lastChapter)
            print("Validating chapter: %s" % lastChapter[0])
        source = getSourceCode(lastChapter[0], self.proxy)
        if source is not None and 'not available yet' in source:
            # If the last chapter is not available remove it from the list
            del self.chapters[upperRange - 1]
            upperRange = upperRange - 1

    # which ones do we want?
    if not self.auto:
        for i in range(0, upperRange):
            print('(%i) %s' % (i + 1, self.chapters[i][1]))
        self.chapters_to_download = self.selectChapters(self.chapters)
    # XML component
    else:
        # >= covers the case where the chapter matching lastDownloaded was
        # the unavailable one just removed.
        if lowerRange >= upperRange:
            raise self.NoUpdates
        for i in range(lowerRange, upperRange):
            self.chapters_to_download.append(i)
def parseSite(self):
    """
    Parses list of chapters and URLs associated with each one for the given
    manga and site.

    The series page is guessed from the manga name.  If that request is
    redirected, fails, or reports a missing page, the site search is used
    ('begins with' first, then 'contains') and ``self.selectFromResults``
    picks the series.  ``self.chapters`` ends up as a list of
    ``(url, name, chapter id)`` tuples.  Raises ``self.MangaNotFound`` or
    ``self.NoUpdates`` as appropriate.
    """
    print('Beginning MangaFox check: %s' % self.manga)

    # jump straight to expected URL and test if manga removed
    url = 'http://mangafox.me/manga/%s/' % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)
    source, redirectURL = getSourceCode(url, self.proxy, True)

    guessFailed = (redirectURL != url
                   or source is None
                   or 'the page you have requested cannot be found' in source)
    if not guessFailed:
        # The Guess worked
        keyword = self.fixFormatting(self.manga)
        if self.verbose_FLAG:
            print("Keyword: %s" % keyword)
    else:
        # Could not find the manga page by guessing; use the website search.
        searchName = '+'.join(self.manga.split())
        url = 'http://mangafox.me/search.php?name_method=bw&name=%s&is_completed=&advopts=1' % searchName
        if self.verbose_FLAG:
            print(url)
        try:
            source = getSourceCode(url, self.proxy)
            seriesResults = []
            if source is not None:
                seriesResults = MangaFox.re_getSeries.findall(source)
            if not seriesResults:
                url = 'http://mangafox.me/search.php?name_method=cw&name=%s&is_completed=&advopts=1' % searchName
                if self.verbose_FLAG:
                    print(url)
                source = getSourceCode(url, self.proxy)
                if source is not None:
                    seriesResults = MangaFox.re_getSeries.findall(source)
        # 0 results
        except AttributeError:
            raise self.MangaNotFound('It doesn\'t exist, or cannot be resolved by autocorrect.')

        keyword = self.selectFromResults(seriesResults)
        if self.verbose_FLAG:
            print("Keyword: %s" % keyword)
        url = 'http://mangafox.me/manga/%s/' % keyword
        if self.verbose_FLAG:
            print("URL: %s" % url)
        source = getSourceCode(url, self.proxy)
        if source is None:
            raise self.MangaNotFound('Search Failed to find Manga.')

    if 'it is not available in Manga Fox.' in source:
        raise self.MangaNotFound('It has been removed.')

    # The patterns embed the series keyword, so they cannot be compiled
    # ahead of time.  Volume + chapter links are tried first.
    isChapterOnly = False
    chapterPattern = re.compile(
        'a href="http://.*?mangafox.*?/manga/%s/(v[\d]+)/(c[\d]+)/[^"]*?" title' % keyword)
    self.chapters = chapterPattern.findall(source)
    if not self.chapters:
        if self.verbose_FLAG:
            print("Trying chapter only regex")
        isChapterOnly = True
        chapterPattern = re.compile(
            'a href="http://.*?mangafox.*?/manga/%s/(c[\d]+)/[^"]*?" title' % keyword)
        self.chapters = chapterPattern.findall(source)

    self.chapters.reverse()

    # Fix URLs from relative to absolute and locate the last downloaded
    # chapter for auto mode.
    lowerRange = 0
    for index, found in enumerate(self.chapters):
        if isChapterOnly:
            # found is a single chapter id string
            if self.verbose_FLAG:
                print("%s" % found)
            if not self.auto:
                print('(%i) %s' % (index + 1, found))
            elif self.lastDownloaded == found:
                lowerRange = index + 1
            self.chapters[index] = (
                'http://mangafox.me/manga/%s/%s' % (keyword, found), found, found)
        else:
            # found is a (volume id, chapter id) pair
            if self.verbose_FLAG:
                print("%s %s" % (found[0], found[1]))
            entry = ('http://mangafox.me/manga/%s/%s/%s' % (keyword, found[0], found[1]),
                     found[0] + "." + found[1],
                     found[1])
            self.chapters[index] = entry
            if not self.auto:
                print('(%i) %s' % (index + 1, entry[1]))
            elif self.lastDownloaded == entry[1]:
                lowerRange = index + 1

    # range() excludes its upper bound, so the plain length is the right limit.
    upperRange = len(self.chapters)

    # which ones do we want?
    if self.auto:
        # XML component
        if lowerRange == upperRange:
            raise self.NoUpdates
        for index in range(lowerRange, upperRange):
            self.chapters_to_download.append(index)
    else:
        self.chapters_to_download = self.selectChapters(self.chapters)
def parseSite(self):
    """
    Search Batoto for the manga and build the list of chapters to download.

    English chapter rows are collected as ``(u, t, g, c)`` tuples, where
    ``u`` is the row's link href, ``t`` the text after the link's image,
    ``g`` the stripped text of the row's third cell and ``c`` the number
    parsed from ``ch<number>`` in the href.  Consecutive rows with the same
    ``c`` are grouped, and each group is reduced to one tuple before the
    list is printed and passed to ``self.selectChapters``.

    Raises ``self.MangaNotFound`` when reading the search rows raises
    ``TypeError``.
    """
    print("Beginning Batoto check: {}".format(self.manga))
    url = "http://www.batoto.net/search?name={}&name_cond=c".format('+'.join(self.manga.split()))
    s = getSourceCode(url, self.proxy)
    soup = BeautifulSoup(s)
    a = soup.find("div", id="comic_search_results")
    # The first row is skipped -- presumably a table header; TODO confirm.
    r = a.tbody.find_all("tr")[1:]
    seriesl = []
    try:
        for i in r:
            u = i.td.a['href']
            # The first character of the text after the image is dropped.
            t = i.td.a.img.next_sibling[1:]
            seriesl.append((u,t.encode('utf-8')))
    except TypeError:
        # signifies no manga found
        raise self.MangaNotFound("Nonexistent.")
    manga = self.selectFromResults(seriesl)
    if self.verbose_FLAG:
        print(manga)
    # NOTE(review): mname is never read afterwards; the lookup still raises
    # IndexError when the selection is not one of the listed hrefs.
    mname = [i for i in seriesl if i[0] == manga][0][1]
    s = getSourceCode(manga, self.proxy)
    soup = BeautifulSoup(s)
    t = soup.find("table", class_="chapters_list").tbody
    cl = t.find_all("tr", class_="lang_English")
    # self.chapters is a list of groups; cnum always refers to the group
    # currently being filled.
    self.chapters = [[]]
    cnum = self.chapters[0]
    for i in cl:
        u = i.td.a['href']
        t = i.td.a.img.next_sibling[1:]
        # presumably the scanlation group name -- TODO confirm
        g = i.find_all("td")[2].get_text().strip()
        try:
            c = float(re.search("ch([\d.]+)", u).group(1))
        except AttributeError:
            # No "ch<number>" in the href.
            c = 0
        tu = (u,t,g,c)
        # Index 3 is the chapter number: a row sharing it with the current
        # group joins that group, anything else starts a new group.
        if len(cnum) == 0 or cnum[0][3] == c:
            cnum.append(tu)
        else:
            self.chapters.append([])
            cnum = self.chapters[-1]
            cnum.append(tu)
    self.chapters.reverse()
    # sc is the tuple kept for the previously processed group.
    sc = None
    for i in self.chapters:
        if len(i) == 1 or sc == None:
            # Single candidate (or very first group): keep the first entry.
            if sc != None and sc[2] != i[0][2]:
                if self.verbose_FLAG:
                    print("switched to {} at {}".format(i[0][2], i[0][3]))
            sc = i[0]
            del i[1:]
            continue
        # Candidates whose g matches the previously kept tuple's g.
        ll = [n for n in i if n[2] == sc[2]]
        if len(ll) != 1:
            # Zero or several matches: follow the previous chapter's
            # "Next Chapter" link and keep the candidate with that href.
            c = self.get_next_url(sc[0])
            i[0] = [n for n in i if n[0] == c][0]
            if self.verbose_FLAG:
                print("Anomaly at chapter {} ({} matches, chose {})".format(i[0][3], len(ll), i[0][2]))
            del i[1:]
            sc = i[0]
            continue
        i[0] = ll[0]
        sc = i[0]
        del i[1:]
    # Every group now holds exactly one tuple; flatten the list.
    self.chapters = [i[0] for i in self.chapters]
    for n,c in enumerate(self.chapters):
        print("{:03d}. {}".format(n+1, c[1].encode('utf-8')))
    self.chapters_to_download = self.selectChapters(self.chapters)
def parseSite(self):
    """
    Find the manga on Batoto and build the list of English chapters.

    Each chapter row is recorded as ``(url, title, chapter, group)``, where
    ``group`` is the stripped text of the row's third cell.  Consecutive
    rows with the same chapter number are collected together and reduced to
    one release: the one whose group matches the release kept for the
    previous chapter when exactly one does, otherwise the one the previous
    chapter's "Next Chapter" link points to.

    In auto mode every chapter after ``self.lastDownloaded`` is queued; in
    interactive mode the list is printed and ``self.selectChapters`` decides.

    Raises:
        self.MangaNotFound: the search produced no usable series rows.
        self.NoUpdates: auto mode found nothing after ``self.lastDownloaded``.
    """
    print("Beginning Batoto check: {0}".format(self.manga))

    url = "http://www.batoto.net/search?name={0}&name_cond=c".format(
        '+'.join(self.manga.split()))
    soup = BeautifulSoup(getSourceCode(url, self.proxy))
    results = soup.find("div", id="comic_search_results")
    rows = results.tbody.find_all("tr")[1:]

    seriesl = []
    for row in rows:
        try:
            link = row.td.findAll('a')[1]
            href = link['href']
            title = link.img.next_sibling[1:]
            seriesl.append((href, title.encode('utf-8')))
        except (AttributeError, IndexError, KeyError, TypeError):
            # Rows without the expected link/image layout are skipped.
            continue

    if not seriesl:
        # signifies no manga found
        raise self.MangaNotFound("Nonexistent.")

    manga = self.selectFromResults(seriesl)
    if self.verbose_FLAG:
        print(manga)

    soup = BeautifulSoup(getSourceCode(manga, self.proxy))
    table = soup.find("table", class_="chapters_list").tbody
    releases = table.find_all("tr", class_="lang_English")

    # Collect consecutive rows that share a chapter number.
    self.chapters = [[]]
    cnum = self.chapters[0]
    for row in releases:
        u = row.td.a['href']
        t = row.td.a.img.next_sibling[1:]
        g = row.find_all("td")[2].get_text().strip()
        try:
            c = float(re.search(r"ch([\d.]+)", u).group(1))
            c = str(int(c)) if c.is_integer() else str(c)
        except AttributeError:
            # No "ch<number>" in the href.
            c = 0
        tu = (u, t, c, g)
        # The chapter number lives at index 2.  Comparing index 3 (the
        # group) against it never matched, so releases were never collected.
        if len(cnum) == 0 or cnum[0][2] == c:
            cnum.append(tu)
        else:
            cnum = [tu]
            self.chapters.append(cnum)

    # With no English rows the initial empty list would make the indexing
    # below fail, so it is dropped.
    self.chapters = [entries for entries in self.chapters if entries]
    self.chapters.reverse()

    # Look for first chapter that should be downloaded in auto mode
    lowerRange = 0
    if self.auto:
        for i, entries in enumerate(self.chapters):
            if self.lastDownloaded == entries[0][1]:
                lowerRange = i + 1

    # Reduce every chapter to one release; sc is the release kept for the
    # previous chapter.
    sc = None
    for entries in self.chapters:
        if len(entries) == 1 or sc is None:
            chosen = entries[0]
            if sc is not None and sc[3] != chosen[3] and self.verbose_FLAG:
                print("switched to {0} at {1}".format(chosen[3], chosen[2]))
        else:
            sameGroup = [n for n in entries if n[3] == sc[3]]
            if len(sameGroup) == 1:
                chosen = sameGroup[0]
            else:
                # Zero or several releases by the previous group: follow the
                # previous chapter's "Next Chapter" link instead.
                nextUrl = self.get_next_url(sc[0])
                chosen = [n for n in entries if n[0] == nextUrl][0]
                if self.verbose_FLAG:
                    print("Anomaly at chapter {0} ({1} matches, chose {2})".format(
                        chosen[2], len(sameGroup), chosen[3]))
        entries[:] = [chosen]
        sc = chosen

    self.chapters = [entries[0] for entries in self.chapters]
    upperRange = len(self.chapters)

    # which ones do we want?
    if not self.auto:
        for n, c in enumerate(self.chapters):
            print("{0:03d}. {1}".format(n + 1, c[1].encode('utf-8')))
        self.chapters_to_download = self.selectChapters(self.chapters)
    # XML component
    else:
        if lowerRange == upperRange:
            raise self.NoUpdates
        for i in range(lowerRange, upperRange):
            self.chapters_to_download.append(i)
def parseSite(self):
    """
    Find the manga on Batoto and build the list of English chapters.

    Each chapter row is recorded as ``(url, title, chapter, group)``, where
    ``group`` is the stripped text of the row's third cell.  Consecutive
    rows with the same chapter number are collected together and reduced to
    one release: the one whose group matches the release kept for the
    previous chapter when exactly one does, otherwise the one the previous
    chapter's "Next Chapter" link points to.

    In auto mode every chapter after ``self.lastDownloaded`` is queued; in
    interactive mode the list is printed and ``self.selectChapters`` decides.

    Raises:
        self.MangaNotFound: the search produced no usable series rows.
        self.NoUpdates: auto mode found nothing after ``self.lastDownloaded``.
    """
    print("Beginning Batoto check: {0}".format(self.manga))

    searchName = '+'.join(self.manga.split())
    url = "http://www.batoto.net/search?name={0}&name_cond=c".format(searchName)
    soup = BeautifulSoup(getSourceCode(url, self.proxy))
    results = soup.find("div", id="comic_search_results")

    seriesl = []
    for row in results.tbody.find_all("tr")[1:]:
        try:
            link = row.td.findAll('a')[1]
            href = link['href']
            title = link.img.next_sibling[1:]
            seriesl.append((href, title.encode('utf-8')))
        except (AttributeError, IndexError, KeyError, TypeError):
            # Rows without the expected link/image layout are skipped.
            continue

    if not seriesl:
        # signifies no manga found
        raise self.MangaNotFound("Nonexistent.")

    manga = self.selectFromResults(seriesl)
    if self.verbose_FLAG:
        print(manga)

    soup = BeautifulSoup(getSourceCode(manga, self.proxy))
    table = soup.find("table", class_="chapters_list").tbody

    # Collect consecutive rows that share a chapter number.
    self.chapters = [[]]
    cnum = self.chapters[0]
    for row in table.find_all("tr", class_="lang_English"):
        u = row.td.a['href']
        t = row.td.a.img.next_sibling[1:]
        g = row.find_all("td")[2].get_text().strip()
        try:
            c = float(re.search(r"ch([\d.]+)", u).group(1))
            c = str(int(c)) if c.is_integer() else str(c)
        except AttributeError:
            # No "ch<number>" in the href.
            c = 0
        tu = (u, t, c, g)
        # The chapter number lives at index 2.  Comparing index 3 (the
        # group) against it never matched, so releases were never collected.
        if len(cnum) == 0 or cnum[0][2] == c:
            cnum.append(tu)
        else:
            cnum = [tu]
            self.chapters.append(cnum)

    # With no English rows the initial empty list would make the indexing
    # below fail, so it is dropped.
    self.chapters = [entries for entries in self.chapters if entries]
    self.chapters.reverse()

    # Look for first chapter that should be downloaded in auto mode
    lowerRange = 0
    if self.auto:
        for i, entries in enumerate(self.chapters):
            if self.lastDownloaded == entries[0][1]:
                lowerRange = i + 1

    # Reduce every chapter to one release; sc is the release kept for the
    # previous chapter.
    sc = None
    for entries in self.chapters:
        if len(entries) == 1 or sc is None:
            chosen = entries[0]
            if sc is not None and sc[3] != chosen[3] and self.verbose_FLAG:
                print("switched to {0} at {1}".format(chosen[3], chosen[2]))
        else:
            sameGroup = [n for n in entries if n[3] == sc[3]]
            if len(sameGroup) == 1:
                chosen = sameGroup[0]
            else:
                # Zero or several releases by the previous group: follow the
                # previous chapter's "Next Chapter" link instead.
                nextUrl = self.get_next_url(sc[0])
                chosen = [n for n in entries if n[0] == nextUrl][0]
                if self.verbose_FLAG:
                    print("Anomaly at chapter {0} ({1} matches, chose {2})".format(
                        chosen[2], len(sameGroup), chosen[3]))
        entries[:] = [chosen]
        sc = chosen

    self.chapters = [entries[0] for entries in self.chapters]
    upperRange = len(self.chapters)

    # which ones do we want?
    if not self.auto:
        for n, c in enumerate(self.chapters):
            print("{0:03d}. {1}".format(n + 1, c[1].encode('utf-8')))
        self.chapters_to_download = self.selectChapters(self.chapters)
    # XML component
    else:
        if lowerRange == upperRange:
            raise self.NoUpdates
        for i in range(lowerRange, upperRange):
            self.chapters_to_download.append(i)
def parseSite(self):
    """
    Parses list of chapters and URLs associated with each one for the given
    manga and site.

    A direct guess at the series URL is tried first.  When it is redirected,
    unreachable, or reports a missing page, MangaFox's search is queried
    ('begins with', then 'contains') and ``self.selectFromResults`` chooses
    the series.  The chapter links found on the series page become
    ``(url, name, chapter id)`` tuples in ``self.chapters``.  Raises
    ``self.MangaNotFound`` or ``self.NoUpdates`` as appropriate.
    """
    searchTemplate = "http://mangafox.me/search.php?name_method=%s&name=%s&is_completed=&advopts=1"

    print("Beginning MangaFox check: %s" % self.manga)

    # jump straight to expected URL and test if manga removed
    url = "http://mangafox.me/manga/%s/" % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)
    source, redirectURL = getSourceCode(url, self.proxy, True)

    if redirectURL != url or source is None or "the page you have requested cannot be found" in source:
        # Could not find the manga page by guessing; use the website search.
        terms = "+".join(self.manga.split())
        url = searchTemplate % ("bw", terms)
        if self.verbose_FLAG:
            print(url)
        try:
            source = getSourceCode(url, self.proxy)
            seriesResults = MangaFox.re_getSeries.findall(source) if source is not None else []
            if len(seriesResults) == 0:
                url = searchTemplate % ("cw", terms)
                if self.verbose_FLAG:
                    print(url)
                source = getSourceCode(url, self.proxy)
                if source is not None:
                    seriesResults = MangaFox.re_getSeries.findall(source)
        # 0 results
        except AttributeError:
            raise self.MangaNotFound("It doesn't exist, or cannot be resolved by autocorrect.")
        else:
            keyword = self.selectFromResults(seriesResults)
            if self.verbose_FLAG:
                print("Keyword: %s" % keyword)
            url = "http://mangafox.me/manga/%s/" % keyword
            if self.verbose_FLAG:
                print("URL: %s" % url)
            source = getSourceCode(url, self.proxy)
            if source is None:
                raise self.MangaNotFound("Search Failed to find Manga.")
    else:
        # The Guess worked
        keyword = self.fixFormatting(self.manga)
        if self.verbose_FLAG:
            print("Keyword: %s" % keyword)

    if "it is not available in Manga Fox." in source:
        raise self.MangaNotFound("It has been removed.")

    # The patterns embed the series keyword, so they cannot be compiled
    # ahead of time.
    withVolume = re.compile('a href="http://.*?mangafox.*?/manga/%s/(v[\d]+)/(c[\d]+)/[^"]*?" title' % keyword)
    self.chapters = withVolume.findall(source)
    isChapterOnly = not self.chapters
    if isChapterOnly:
        if self.verbose_FLAG:
            print("Trying chapter only regex")
        withoutVolume = re.compile('a href="http://.*?mangafox.*?/manga/%s/(c[\d]+)/[^"]*?" title' % keyword)
        self.chapters = withoutVolume.findall(source)

    self.chapters.reverse()

    # Fix URLs from relative to absolute and locate the last downloaded
    # chapter for auto mode.
    lowerRange = 0
    if isChapterOnly:
        for slot, chapterId in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s" % chapterId)
            if self.auto:
                if self.lastDownloaded == chapterId:
                    lowerRange = slot + 1
            else:
                print("(%i) %s" % (slot + 1, chapterId))
            address = "http://mangafox.me/manga/%s/%s" % (keyword, chapterId)
            self.chapters[slot] = (address, chapterId, chapterId)
    else:
        for slot, pair in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s %s" % (pair[0], pair[1]))
            address = "http://mangafox.me/manga/%s/%s/%s" % (keyword, pair[0], pair[1])
            name = pair[0] + "." + pair[1]
            self.chapters[slot] = (address, name, pair[1])
            if self.auto:
                if self.lastDownloaded == name:
                    lowerRange = slot + 1
            else:
                print("(%i) %s" % (slot + 1, name))

    # range() excludes its upper bound, so the plain length is the right limit.
    upperRange = len(self.chapters)

    # which ones do we want?
    if not self.auto:
        self.chapters_to_download = self.selectChapters(self.chapters)
        return

    # XML component
    if lowerRange == upperRange:
        raise self.NoUpdates
    for pending in range(lowerRange, upperRange):
        self.chapters_to_download.append(pending)
def parseSite(self):
    """
    Parses list of chapters and URLs associated with each one for the given
    manga and site.

    The series page is first guessed from the manga name.  If that page is
    missing, the site search is queried (waiting five seconds when the site
    reports that the search came too soon, and retrying once when no series
    is returned) and ``self.selectFromResults`` picks the series.  Chapter
    links are extracted (volume + chapter ids, falling back to chapter-only
    ids), sorted with ``self.chapter_compare`` and rewritten into
    ``(url, name, chapter id)`` tuples.  The last chapter is dropped when
    its page says it is not available.

    Raises:
        self.MangaNotFound: the series cannot be resolved, its page cannot
            be fetched, or the site reports it as removed.
        self.NoUpdates: auto mode found nothing after ``self.lastDownloaded``.
    """
    # Local import: cmp_to_key adapts the comparison function for sorted(),
    # which no longer accepts cmp= on Python 3.
    import functools

    print('Beginning MangaHere check: %s' % self.manga)

    # jump straight to expected URL and test if manga removed
    url = 'http://www.mangahere.com/manga/%s/' % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)
    source = getSourceCode(url, self.proxy)

    if source is None or 'the page you have requested can' in source:
        url = 'http://www.mangahere.com/search.php?name=%s' % '+'.join(self.manga.split())
        if self.verbose_FLAG:
            print(url)
        try:
            source = getSourceCode(url, self.proxy)
            if source is not None and 'Sorry you have just searched, please try 5 seconds later.' in source:
                # The throttle page contains no series, so the retry below
                # runs after the wait.
                print('Searched too soon, waiting 5 seconds...')
                time.sleep(5)

            # findall is only run on a page that was actually fetched; the
            # unguarded call that used to precede this crashed on None.
            seriesResults = []
            if source is not None:
                seriesResults = MangaHere.re_getSeries.findall(source)

            if not seriesResults:
                # Same query once more before giving up.
                if self.verbose_FLAG:
                    print(url)
                source = getSourceCode(url, self.proxy)
                if source is not None:
                    seriesResults = MangaHere.re_getSeries.findall(source)
        # 0 results
        except AttributeError:
            raise self.MangaNotFound('It doesn\'t exist, or cannot be resolved by autocorrect.')
        else:
            keyword = self.selectFromResults(seriesResults)
            if self.verbose_FLAG:
                print("Keyword: %s" % keyword)
            url = 'http://www.mangahere.com/manga/%s/' % keyword
            if self.verbose_FLAG:
                print(url)
            source = getSourceCode(url, self.proxy)
            if source is None:
                # Same handling as the MangaFox parser: without a series
                # page there is nothing to parse.
                raise self.MangaNotFound('Search Failed to find Manga.')
    else:
        # The Guess worked
        keyword = self.fixFormatting(self.manga)
        if self.verbose_FLAG:
            print("Keyword: %s" % keyword)

    # other check for manga removal if our initial guess for the name was wrong
    if 'it is not available in.' in source:
        raise self.MangaNotFound('It has been removed.')

    # can't pre-compile this because relies on class name
    isChapterOnly = False
    re_getChapters = re.compile(
        r'a.*?href="http://.*?mangahere.*?/manga/%s/(v[\d]+)/(c[\d]+(\.[\d]+)?)/[^"]*?"' % keyword)
    self.chapters = re_getChapters.findall(source)
    if not self.chapters:
        if self.verbose_FLAG:
            print("Trying chapter only regex")
        isChapterOnly = True
        re_getChapters = re.compile(
            r'a.*?href="http://.*?mangahere.*?/manga/%s/(c[\d]+(\.[\d]+)?)/[^"]*?"' % keyword)
        self.chapters = re_getChapters.findall(source)

    # Sort chapters by volume and chapter number. Needed because next
    # chapter isn't always accurate.
    self.chapters = sorted(self.chapters, key=functools.cmp_to_key(self.chapter_compare))

    # Fix URLs from relative to absolute and locate the last downloaded
    # chapter for auto mode.
    lowerRange = 0
    if isChapterOnly:
        # Each match is (chapter id, optional decimal part).
        for i, chapter in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s" % chapter[0])
            if self.auto and self.lastDownloaded == chapter[0]:
                lowerRange = i + 1
            self.chapters[i] = (
                'http://www.mangahere.com/manga/%s/%s' % (keyword, chapter[0]),
                chapter[0],
                chapter[0])
    else:
        # Each match is (volume id, chapter id, optional decimal part).
        for i, chapter in enumerate(self.chapters):
            if self.verbose_FLAG:
                print("%s %s" % (chapter[0], chapter[1]))
            self.chapters[i] = (
                'http://www.mangahere.com/manga/%s/%s/%s' % (keyword, chapter[0], chapter[1]),
                chapter[0] + "." + chapter[1],
                chapter[1])
            if self.auto and self.lastDownloaded == chapter[1]:
                lowerRange = i + 1

    upperRange = len(self.chapters)

    # Validate whether the last chapter is actually available.  Skipped when
    # no chapter was found, which used to raise IndexError.
    if upperRange:
        lastChapter = self.chapters[upperRange - 1]
        if self.verbose_FLAG:
            print(lastChapter)
            print("Validating chapter: %s" % lastChapter[0])
        source = getSourceCode(lastChapter[0], self.proxy)
        if source is not None and (
                'not available yet' in source
                or 'Sorry, the page you have requested can’t be found' in source):
            # If the last chapter is not available remove it from the list
            del self.chapters[upperRange - 1]
            upperRange = upperRange - 1

    # which ones do we want?
    if not self.auto:
        # Entries are (url, name, chapter id) tuples by now in both modes,
        # so index 1 is the name; index 0 printed the URL in chapter-only
        # mode.
        for i in range(0, upperRange):
            print('(%i) %s' % (i + 1, self.chapters[i][1]))
        self.chapters_to_download = self.selectChapters(self.chapters)
    # XML component
    else:
        # >= covers the case where the chapter matching lastDownloaded was
        # the unavailable one just removed.
        if lowerRange >= upperRange:
            raise self.NoUpdates
        for i in range(lowerRange, upperRange):
            self.chapters_to_download.append(i)
def parseSite(self):
    """
    Parses list of chapters and URLs associated with each one for the given manga and site.

    Reads self.manga, self.verbose_FLAG, self.auto and self.lastDownloaded.
    Fills self.chapters with 3-tuples of
    (chapter URL, '<volume>.<chapter>' label, chapter id) ordered by reversing
    the order in which the links appear on the series page, and fills
    self.chapters_to_download with the selected chapters.

    Raises self.MangaNotFound if the site reports the series as unavailable or
    the search step fails with AttributeError, and self.NoUpdates when
    self.auto is set and nothing follows the last downloaded chapter.
    """
    print('Beginning MangaFox check: %s' % self.manga)

    # jump straight to expected URL and test if manga removed
    url = 'http://www.mangafox.com/manga/%s/' % self.fixFormatting(self.manga)
    if self.verbose_FLAG:
        print(url)
    source = getSourceCode(url)
    if 'it is not available in Manga Fox.' in source:
        raise self.MangaNotFound('It has been removed.')

    # do a 'begins-with' search, then a 'contains' search
    query = '+'.join(self.manga.split())
    url = 'http://www.mangafox.com/search.php?name_method=bw&name=%s' % query
    if self.verbose_FLAG:
        print(url)
    try:
        source = getSourceCode(url)
        seriesResults = MangaFox.re_getSeries.findall(source)
        if not seriesResults:
            url = 'http://www.mangafox.com/search.php?name=%s' % query
            if self.verbose_FLAG:
                print(url)
            source = getSourceCode(url)
            seriesResults = MangaFox.re_getSeries.findall(source)
    except AttributeError:
        raise self.MangaNotFound('It doesn\'t exist, or cannot be resolved by autocorrect.')

    keyword = self.selectFromResults(seriesResults)
    if self.verbose_FLAG:
        print("Keyword: %s" % keyword)
    url = 'http://www.mangafox.com/manga/%s/' % keyword
    source = getSourceCode(url)

    # other check for manga removal if our initial guess for the name was wrong
    if 'it is not available in Manga Fox.' in source:
        raise self.MangaNotFound('It has been removed.')

    # Can't pre-compile this because it depends on the series keyword.
    # The keyword is escaped so that regex metacharacters in it are matched
    # literally and cannot add capture groups (which would change the shape
    # of the tuples returned by findall).
    re_getChapters = re.compile(
        r'a href="http://.*?mangafox.*?/manga/%s/(v[\d]+)/(c[\d]+)/[^"]*?" title'
        % re.escape(keyword))
    matches = re_getChapters.findall(source)
    matches.reverse()

    # Each match is (volume, chapter); expand it to
    # (absolute chapter URL, '<volume>.<chapter>' label, chapter id).
    self.chapters = [
        ('http://www.mangafox.com/manga/%s/%s/%s' % (keyword, volume, chapter),
         volume + "." + chapter,
         chapter)
        for volume, chapter in matches
    ]

    # List the chapters for the user, or (when self.auto is set) locate the
    # entry following the last downloaded chapter.
    lowerRange = 0
    for i, entry in enumerate(self.chapters):
        if not self.auto:
            print('(%i) %s' % (i + 1, entry[1]))
        elif self.lastDownloaded == entry[1]:
            lowerRange = i + 1

    # range() excludes its upper bound, so len(self.chapters) already covers
    # the final chapter; no "+ 1" is needed.
    upperRange = len(self.chapters)

    # which ones do we want?
    if not self.auto:
        self.chapters_to_download = self.selectChapters(self.chapters)
    else:
        # XML component
        if lowerRange == upperRange:
            raise self.NoUpdates
        self.chapters_to_download.extend(range(lowerRange, upperRange))
    return