def getImages(self, baseUrl):
    with self.wg.chromiumContext() as cr:
        resp = cr.blocking_navigate_and_get_source(baseUrl)
        pgctnt = self.check_recaptcha(pgurl=baseUrl, markup=resp['content'])

        linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')
        links = linkRe.findall(pgctnt)

        pages = []
        for item in links:
            # Evaluate the page's own wrapKA() routine inside the browser
            # context to decrypt each image URL.
            resp_asm = cr.execute_javascript("function() { return %s; }" % item, returnByValue=True)

            # This is horrible.
            tgt = resp_asm['result']['result']['value']['value']

            if not tgt.startswith("http"):
                raise ScrapeExceptions.LimitedException("URL Decryption failed!")
            pages.append(tgt)

        self.log.info("Found %s pages", len(pages))
        self.wg._syncOutOfChromium(cr)

        images = []
        for imgUrl in pages:
            imageName, imageContent = self.getImage(imgUrl)
            images.append((imageName, imageContent))

        return images
def getImageUrls(self, baseUrl):
    pages = set()
    soup = self.wg.getSoup(baseUrl)

    imagesDiv = soup.find('div', class_='chapterPages')
    if not imagesDiv:
        if soup.find("div", class_='primaryContent') and soup.find("div", class_='primaryContent').find('div', class_='messageContent'):
            raise ScrapeExceptions.NotMangaException("This item appears to be a Light-Novel!")

    images = imagesDiv.find_all('img', class_='avatar')

    pageno = 1
    for image in images:
        src = image['src']

        # Undo mod_pagespeed's URL rewriting so we fetch the original image
        # file rather than the recompressed copy.
        if "pagespeed" in src:
            scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
            root, filename = os.path.split(path)
            filename = filename.split(".pagespeed.")[0]
            if filename.startswith("x"):
                filename = filename[1:]
            path = os.path.join(root, filename)
            src = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))

        pages.add((pageno, src))
        pageno += 1

    pages = list(pages)
    pages.sort()

    return pages
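# A minimal, self-contained sketch of the pagespeed-stripping step above,
# factored into a standalone helper so it can be exercised on its own. The
# helper name strip_pagespeed and the example URL are illustrative
# assumptions, not part of the scraper.
import os
import urllib.parse

def strip_pagespeed(src):
    # Strip mod_pagespeed's filename mangling ("x<name>.pagespeed.<hash>.<ext>")
    # and rebuild the URL around the original filename.
    if "pagespeed" not in src:
        return src
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
    root, filename = os.path.split(path)
    filename = filename.split(".pagespeed.")[0]
    if filename.startswith("x"):
        filename = filename[1:]
    path = os.path.join(root, filename)
    return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))

# Example (hypothetical URL):
#   strip_pagespeed("https://example.com/images/xpage-001.jpg.pagespeed.ic.AbCdEf.jpg")
#   -> "https://example.com/images/page-001.jpg"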
def check_recaptcha(self, pgurl, soup=None, markup=None):
    if markup:
        soup = WebRequest.as_soup(markup)
    if not soup:
        raise RuntimeError("You have to pass either the raw page markup, or a pre-parsed bs4 soup object!")

    capdiv = soup.find("div", class_='g-recaptcha')
    if not capdiv:
        if markup:
            return markup
        return soup

    raise ScrapeExceptions.LimitedException("Encountered ReCaptcha! Cannot circumvent!")

    # Everything below is unreachable work-in-progress for circumventing the
    # captcha via the 2captcha service.
    self.log.warning("Found ReCaptcha div. Need to circumvent.")
    sitekey = capdiv['data-sitekey']
    # soup.find("")

    params = {
        'key'       : settings.captcha_solvers['2captcha']['api_key'],
        'method'    : 'userrecaptcha',
        'googlekey' : sitekey,
        'pageurl'   : pgurl,
        'json'      : 1,
    }

    # self.wg.getJson("https://2captcha.com/in.php", postData=params)

    # # here we post site key to 2captcha to get captcha ID (and we parse it here too)
    # captcha_id = s.post("?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(API_KEY, site_key, url), proxies=proxy).text.split('|')[1]
    # # then we parse gresponse from 2captcha response
    # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text
    # print("solving ref captcha...")
    # while 'CAPCHA_NOT_READY' in recaptcha_answer:
    #     sleep(5)
    #     recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text
    # recaptcha_answer = recaptcha_answer.split('|')[1]

    # # we make the payload for the post data here, use something like mitmproxy or fiddler to see what is needed
    # payload = {
    #     'key': 'value',
    #     'gresponse': recaptcha_answer  # This is the response from 2captcha, which is needed for the post request to go through.
    # }

    resolved = {
        "reUrl"                : "/Manga/Love-Lab-MIYAHARA-Ruri/Vol-010-Ch-001?id=359632",
        "g-recaptcha-response" : "03AOP2lf5kLccgf5aAkMmzXR8mN6Kv6s76BoqHIv-raSzGCa98HMPMdx0n04ourhM1mBApnesMRbzr2vFa0264mY83SCkL5slCFcC-i3uWJoHIjVhGh0GN4yyswg5-yZpDg1iK882nPuxEeaxb18pOK790x4Z18ib5UOPGU-NoECVb6LS03S3b4fCjWwRDLNF43WhkHDFd7k-Os7ULCgOZe_7kcF9xbKkovCh2uuK0ytD7rhiKnZUUvl1TimGsSaFkSSrQ1C4cxZchVXrz7kIx0r6Qp2hPr2_PW0CAutCkmr9lt9TS5n0ecdVFhdVQBniSB-NZv9QEpbQ8",
    }
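# The commented-out block in check_recaptcha() outlines the 2captcha
# 'userrecaptcha' flow that was never wired up. Below is a hedged,
# self-contained sketch of that flow against 2captcha's in.php/res.php
# endpoints; the helper name, polling interval, and timeout are assumptions,
# not part of the scraper.
import time
import requests

def solve_recaptcha_2captcha(api_key, sitekey, pageurl, poll_interval=5, timeout=180):
    # Submit the sitekey/page URL pair; 2captcha replies with a job id.
    sub = requests.post("http://2captcha.com/in.php", data={
        'key'       : api_key,
        'method'    : 'userrecaptcha',
        'googlekey' : sitekey,
        'pageurl'   : pageurl,
        'json'      : 1,
    }).json()
    if sub.get('status') != 1:
        raise RuntimeError("2captcha submit failed: %s" % sub)
    captcha_id = sub['request']

    # Poll res.php until a worker has produced the g-recaptcha-response token.
    deadline = time.time() + timeout
    while time.time() < deadline:
        time.sleep(poll_interval)
        res = requests.get("http://2captcha.com/res.php", params={
            'key'    : api_key,
            'action' : 'get',
            'id'     : captcha_id,
            'json'   : 1,
        }).json()
        if res.get('status') == 1:
            # This token would go into the 'g-recaptcha-response' field of the
            # follow-up POST (compare the `resolved` dict above).
            return res['request']
        if res.get('request') != 'CAPCHA_NOT_READY':
            raise RuntimeError("2captcha solve failed: %s" % res)
    raise RuntimeError("2captcha solve timed out")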
def getDownloadInfo(self, link_row_id):
    with self.row_context(dbid=link_row_id) as row:
        source_url = row.source_id
        row.state = 'fetching'

        self.log.info("Retrieving item: %s", source_url)

        try:
            soup = self.wg.getSoup(source_url, addlHeaders={'Referer': self.urlBase})
        except Exception:
            self.log.critical("No download at url %s! Dbid = %s", source_url, link_row_id)
            for line in traceback.format_exc().split("\n"):
                self.log.critical(line)
            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        if "This gallery has been removed, and is unavailable." in soup.get_text():
            self.log.info("Gallery deleted. Removing.")
            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        item_tags = self.getTags(soup)

        if not item_tags:
            self.log.info("No tags. Removing.")
            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        # self.addTags(sourceUrl=sourceUrl, tags=tags)
        # return True

        ret = {
            'dlPage'    : self.getDownloadPageUrl(soup),
            'item_tags' : item_tags,
        }

        return ret
def getImageUrls(self, baseUrl):
    pgctnt, filename, mimetype = self.wg.getItemPhantomJS(baseUrl)
    pgctnt = self.check_recaptcha(pgurl=baseUrl, markup=pgctnt)

    linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')
    links = linkRe.findall(pgctnt)

    pages = []
    for item in links:
        # Have PhantomJS evaluate the page's wrapKA() routine to decrypt the image URL.
        tgt = self.wg.pjs_driver.execute_script("return %s" % item)
        if not tgt.startswith("http"):
            raise ScrapeExceptions.LimitedException("URL Decryption failed!")
        pages.append(tgt)

    self.log.info("Found %s pages", len(pages))

    return pages