Code example #1
File: ContentLoader.py Project: ryuzakace/MangaCMS
    def getImages(self, baseUrl):

        with self.wg.chromiumContext() as cr:

            resp = cr.blocking_navigate_and_get_source(baseUrl)
            pgctnt = self.check_recaptcha(pgurl=baseUrl,
                                          markup=resp['content'])
            linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')
            links = linkRe.findall(pgctnt)

            pages = []
            for item in links:
                resp_asm = cr.execute_javascript("function() { return %s; }" %
                                                 item,
                                                 returnByValue=True)

                # This is horrible: unwrap the doubly-nested remote
                # evaluation response to get the decrypted URL string.
                tgt = resp_asm['result']['result']['value']['value']

                if not tgt.startswith("http"):
                    raise ScrapeExceptions.LimitedException(
                        "URL Decryption failed!")
                pages.append(tgt)

            self.log.info("Found %s pages", len(pages))

            self.wg._syncOutOfChromium(cr)

            images = []
            for imgUrl in pages:
                imageName, imageContent = self.getImage(imgUrl)
                images.append((imageName, imageContent))
        return images
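
A note on the extraction above: the regex captures the entire wrapKA("...") call, not just its argument, so the call can later be evaluated inside the page's own JavaScript context, where the site defines wrapKA() as its URL-decryption routine. A minimal standalone sketch of just the regex step, run against made-up markup (the ciphertext strings are placeholders, not real data):

    import re

    # Made-up page fragment; real pages embed many such pushes in a <script> tag.
    pgctnt = '''
    lstImages.push(wrapKA("c1phert3xt_one"));
    lstImages.push(wrapKA("c1phert3xt_two"));
    '''

    # Same pattern as in getImages(): capture the whole wrapKA("...") call.
    linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')

    print(linkRe.findall(pgctnt))
    # -> ['wrapKA("c1phert3xt_one")', 'wrapKA("c1phert3xt_two")']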
Code example #2
    def getImageUrls(self, baseUrl):
        pages = set()

        soup = self.wg.getSoup(baseUrl)

        imagesDiv = soup.find('div', class_='chapterPages')
        if not imagesDiv:
            # Light-novel releases have a text messageContent block instead
            # of chapter images.
            primary = soup.find("div", class_='primaryContent')
            if primary and primary.find('div', class_='messageContent'):
                raise ScrapeExceptions.NotMangaException(
                    "This item appears to be a Light-Novel!")
            # No image container at all; fail explicitly rather than let
            # imagesDiv.find_all() below raise an AttributeError on None.
            raise ScrapeExceptions.UnwantedContentError(
                "No chapter images found!")
        images = imagesDiv.find_all('img', class_='avatar')

        pageno = 1
        for image in images:
            src = image['src']
            if "pagespeed" in src:
                scheme, netloc, path, query, fragment = urllib.parse.urlsplit(
                    src)
                root, filename = os.path.split(path)
                filename = filename.split(".pagespeed.")[0]
                if filename.startswith("x"):
                    filename = filename[1:]
                path = os.path.join(root, filename)
                src = urllib.parse.urlunsplit(
                    (scheme, netloc, path, query, fragment))

            pages.add((pageno, src))
            pageno += 1

        pages = list(pages)
        pages.sort()
        return pages
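
The "pagespeed" branch above undoes Google PageSpeed's image rewriting so the original file is fetched instead of the recompressed copy. As a rough standalone illustration (the URL below is invented), the rewritten filename is truncated at ".pagespeed." and the leading "x" the rewriter sometimes prepends is stripped:

    import os
    import urllib.parse

    def un_pagespeed(src):
        # Mirrors the branch in getImageUrls() above; only applied when
        # "pagespeed" appears in the URL.
        scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
        root, filename = os.path.split(path)
        filename = filename.split(".pagespeed.")[0]
        if filename.startswith("x"):
            filename = filename[1:]
        path = os.path.join(root, filename)
        return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))

    print(un_pagespeed("https://example.com/img/xcover.jpg.pagespeed.ic.AbCd12.jpg"))
    # -> https://example.com/img/cover.jpg

Note the heuristic would also strip a genuine leading "x" from the filename; the original code accepts that risk for URLs containing "pagespeed".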
Code example #3
File: ContentLoader.py Project: ryuzakace/MangaCMS
    def check_recaptcha(self, pgurl, soup=None, markup=None):
        if markup:
            soup = WebRequest.as_soup(markup)
        if not soup:
            raise RuntimeError(
                "You have to pass either the raw page markup, or a pre-parsed bs4 soup object!"
            )

        capdiv = soup.find("div", class_='g-recaptcha')
        if not capdiv:
            if markup:
                return markup
            return soup

        raise ScrapeExceptions.LimitedException(
            "Encountered ReCaptcha! Cannot circumvent!")

        self.log.warning("Found ReCaptcha div. Need to circumvent.")
        sitekey = capdiv['data-sitekey']

        # soup.find("")

        params = {
            'key': settings.captcha_solvers['2captcha']['api_key'],
            'method': 'userrecaptcha',
            'googlekey': sitekey,
            'pageurl': pgurl,
            'json': 1,
        }

        # self.wg.getJson("https://2captcha.com/in.php", postData=params)

        # # here we post site key to 2captcha to get captcha ID (and we parse it here too)
        # captcha_id = s.post("?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(API_KEY, site_key, url), proxies=proxy).text.split('|')[1]

        # # then we parse gresponse from 2captcha response
        # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text
        # print("solving ref captcha...")
        # while 'CAPCHA_NOT_READY' in recaptcha_answer:
        # 	sleep(5)
        # 	recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text
        # recaptcha_answer = recaptcha_answer.split('|')[1]

        # # we make the payload for the post data here, use something like mitmproxy or fiddler to see what is needed
        # payload = {
        # 	'key': 'value',
        # 	'gresponse': recaptcha_answer  # This is the response from 2captcha, which is needed for the post request to go through.
        # 	}

        resolved = {
            "reUrl":
            "/Manga/Love-Lab-MIYAHARA-Ruri/Vol-010-Ch-001?id=359632",
            "g-recaptcha-response":
            "03AOP2lf5kLccgf5aAkMmzXR8mN6Kv6s76BoqHIv-raSzGCa98HMPMdx0n04ourhM1mBApnesMRbzr2vFa0264mY83SCkL5slCFcC-i3uWJoHIjVhGh0GN4yyswg5-yZpDg1iK882nPuxEeaxb18pOK790x4Z18ib5UOPGU-NoECVb6LS03S3b4fCjWwRDLNF43WhkHDFd7k-Os7ULCgOZe_7kcF9xbKkovCh2uuK0ytD7rhiKnZUUvl1TimGsSaFkSSrQ1C4cxZchVXrz7kIx0r6Qp2hPr2_PW0CAutCkmr9lt9TS5n0ecdVFhdVQBniSB-NZv9QEpbQ8",
        }
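
The commented-out block above sketches the intended 2captcha flow: submit the site key to in.php to get a job ID, then poll res.php until the solver hands back a g-recaptcha-response token. A hedged reconstruction of that flow with requests (endpoints and parameters are taken from the comments above; the helper itself is illustrative, not the project's actual code):

    import time
    import requests

    def solve_recaptcha(api_key, sitekey, pgurl):
        # Submit the job; params match the dict built in check_recaptcha().
        sub = requests.post("https://2captcha.com/in.php", data={
            'key': api_key,
            'method': 'userrecaptcha',
            'googlekey': sitekey,
            'pageurl': pgurl,
            'json': 1,
        }).json()
        captcha_id = sub['request']

        # Poll until the solver is done (the comments poll every 5 seconds).
        while True:
            time.sleep(5)
            res = requests.get("https://2captcha.com/res.php", params={
                'key': api_key,
                'action': 'get',
                'id': captcha_id,
                'json': 1,
            }).json()
            if res['request'] != 'CAPCHA_NOT_READY':
                return res['request']  # the g-recaptcha-response token

The returned token would then be posted back to the site in a form payload like the captured resolved dict above, with the token in the g-recaptcha-response field.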
Code example #4
    def getDownloadInfo(self, link_row_id):

        with self.row_context(dbid=link_row_id) as row:
            source_url = row.source_id
            row.state = 'fetching'

        self.log.info("Retrieving item: %s", source_url)

        try:
            soup = self.wg.getSoup(source_url,
                                   addlHeaders={'Referer': self.urlBase})

        except Exception as e:
            self.log.critical("No download at url %s! Dbid = %s", source_url,
                              link_row_id)
            for line in traceback.format_exc().split("\n"):
                self.log.critical("%s", line)

            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        if "This gallery has been removed, and is unavailable." in soup.get_text(
        ):
            self.log.info("Gallery deleted. Removing.")
            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        item_tags = self.getTags(soup)
        if not item_tags:
            self.log.info("No tags. Removing.")
            raise ScrapeExceptions.UnwantedContentError("Item missing?")

        # self.addTags(sourceUrl=sourceUrl, tags=tags)
        # return True

        ret = {
            'dlPage': self.getDownloadPageUrl(soup),
            'item_tags': item_tags,
        }

        return ret
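
The row_context() call at the top is a context manager that checks the link row out of the database, lets the caller mutate it (here, flipping state to 'fetching'), and persists the change when the with-block exits. The project's actual implementation isn't shown in these examples; a minimal sketch of the general shape, assuming an SQLAlchemy-style session factory (every name below besides row_context is hypothetical):

    import contextlib

    @contextlib.contextmanager
    def row_context(self, dbid):
        # Hypothetical sketch: fetch the row, yield it for mutation, then
        # commit on clean exit or roll back if the block raised.
        session = self.db_session()  # assumed session factory
        try:
            row = session.query(self.link_table).filter_by(id=dbid).one()
            yield row
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()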
Code example #5
File: ContentLoader.py Project: cbunch/MangaCMS
    def getImageUrls(self, baseUrl):

        pgctnt, filename, mimetype = self.wg.getItemPhantomJS(baseUrl)

        pgctnt = self.check_recaptcha(pgurl=baseUrl, markup=pgctnt)

        linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')

        links = linkRe.findall(pgctnt)

        pages = []
        for item in links:
            tgt = self.wg.pjs_driver.execute_script("return %s" % item)
            if not tgt.startswith("http"):
                raise ScrapeExceptions.LimitedException(
                    "URL Decryption failed!")
            pages.append(tgt)

        self.log.info("Found %s pages", len(pages))

        return pages