Example #1
0
 def __init__(self):
     """Attach a LogHandler and prepare the three default option dicts."""
     self.log = LogHandler()
     # Viewport dimensions, in pixels.
     self.set_view_port_option = dict(width=1920, height=1080)
     # Navigation option; 'networkidle2' is the wait condition requested.
     self.goto_option = dict(waitUntil='networkidle2')
     # PDF option carries explicit dimensions as well as a paper format.
     # NOTE(review): the consumer of this dict is outside this block;
     # confirm which of width/height or format takes effect there.
     self.pdf_option = dict(width=1920, height=1080, format='A4')
Example #2
0
class Spider(object):
    """Open a URL in a launched browser and render the page to a PDF buffer."""

    def __init__(self):
        """Attach a LogHandler and prepare the default page options."""
        self.log = LogHandler()
        # Passed to page.setViewport().
        self.set_view_port_option = {'width': 1920, 'height': 1080}
        # Passed to page.goto() as the navigation options.
        self.goto_option = {
            'waitUntil': 'networkidle2',
        }
        # Passed to page.pdf().
        self.pdf_option = {
            'width': 1920,
            'height': 1080,
            'format': 'A1',
        }

    async def spider(self, url):
        """Crawl *url* and return its PDF rendering.

        Args:
            url: Address handed to ``page.goto``.

        Returns:
            A ``(filename, pdf)`` tuple on success, where ``filename`` is the
            page title passed through ``translate_word`` and ``pdf`` is the
            value returned by ``page.pdf``. Returns ``None`` when any
            exception was raised while crawling; the error is logged instead
            of being propagated.
        """
        self.log.detail_info(f"Start crawl {url}")
        self.log.detail_info(f"{url} started at {time.strftime('%X')}")
        # Handle Error: pyppeteer.errors.NetworkError: Protocol error Runtime.callFunctionOn: Target closed.
        # Issue fix #1 by C1tas
        #
        # Pre-bind the name so the cleanup below is safe even when launch()
        # itself fails; otherwise `finally` would raise UnboundLocalError and
        # mask the error that was just handled.
        browser = None
        try:
            browser = await launch()
            page = await browser.newPage()
            await page.setViewport(self.set_view_port_option)
            await page.goto(url, self.goto_option)
            await scroll_page(page)
            pdf = await page.pdf(self.pdf_option)
            title = await page.title()
            filename = await self.translate_word(title)
            return filename, pdf
        except Exception as exc:
            self.log.error_then(
                f"Occur an unexpected error when crawl {url}: {str(exc)}!")
            return None
        finally:
            # Single cleanup point: the browser is closed exactly once on
            # both the success and the failure path.
            if browser is not None:
                await browser.close()
            self.log.detail_info(f"{url} finished at {time.strftime('%X')}")

    @staticmethod
    async def translate_word(word):
        """Return *word* with selected punctuation replaced.

        Each character of the first string below is mapped to the character
        at the same position in the second one (for example '【' -> '[' and
        '/' -> '-'); all other characters are left untouched.
        """
        table = str.maketrans(u',。!?【】()/%#@&1234567890',
                              u',.!?[]()-%#@&1234567890')
        return word.translate(table)
Example #3
0
class Spider(object):
    """Open a URL in a launched browser and render the page to a PDF buffer."""

    def __init__(self):
        """Attach a LogHandler and prepare the default page options."""
        self.log = LogHandler()
        # Passed to page.setViewport().
        self.set_view_port_option = {'width': 1920, 'height': 1080}
        # Passed to page.goto() as the navigation options.
        self.goto_option = {
            'waitUntil': 'networkidle2',
        }
        # Passed to page.pdf().
        self.pdf_option = {
            'width': 1920,
            'height': 1080,
            'format': 'A4',
        }

    async def spider(self, url):
        """Crawl *url* and return its PDF rendering.

        Args:
            url: Address handed to ``page.goto``.

        Returns:
            A ``(filename, pdf)`` tuple: the page title passed through
            ``translate_word`` and the value returned by ``page.pdf``.

        Raises:
            Whatever the browser/page calls raise; errors are not swallowed
            here, but the browser is always closed before they propagate.
        """
        self.log.detail_info(f"[*] Start crawl {url}")
        self.log.detail_info(f"[*] {url} started at {time.strftime('%X')}")
        browser = await launch()
        try:
            page = await browser.newPage()
            await page.setViewport(self.set_view_port_option)
            await page.goto(url, self.goto_option)
            title = await page.title()
            filename = await self.translate_word(title)
            # Run the scrolling script in the page before rendering it.
            await page.evaluate(scroll_page_js)
            pdf = await page.pdf(self.pdf_option)
        finally:
            # Close the browser even when a step above fails, so a failed
            # crawl does not leave the launched browser behind.
            await browser.close()
        self.log.detail_info(f"[*] {url} finished at {time.strftime('%X')}")
        return filename, pdf

    async def translate_word(self, word):
        """Return *word* with selected punctuation replaced.

        Each character of the first string below is mapped to the character
        at the same position in the second one (for example '【' -> '[' and
        '/' -> '-'); all other characters are left untouched.
        """
        table = str.maketrans(u',。!?【】()/%#@&1234567890',
                              u',.!?[]()-%#@&1234567890')
        return word.translate(table)
Example #4
0
 def __init__(self) -> None:
     """Instantiate the collaborators kept on this object.

     Stores a LogHandler as ``self.log``, a Worker as ``self.worker`` and a
     Check as ``self.check``; each is constructed with no arguments.
     """
     self.log = LogHandler()
     self.worker = Worker()
     self.check = Check()
Example #5
0
 def __init__(self) -> None:
     """Store a freshly constructed LogHandler as ``self.log``."""
     self.log = LogHandler()
Example #6
0
 def __init__(self) -> None:
     """Instantiate the collaborators kept on this object.

     Stores a LogHandler as ``self.log`` and a CommonWorker as
     ``self.worker``; both are constructed with no arguments.
     """
     self.log = LogHandler()
     self.worker = CommonWorker()
Example #7
0
class Check(object):
    """Look up a previously saved authorization value, refreshing it if needed."""

    def __init__(self):
        """Attach the logger and the worker used to fetch a new token."""
        self.worker = None
        self.log = LogHandler()
        self.worker = Worker()

    async def check_authorization_code(self):
        """Return the authorization string to use.

        Reads ``authorization_code.json`` from the current directory. When it
        holds a non-empty ``authorization`` entry, that value is returned.
        When the file is missing, unreadable or has no usable entry, the file
        (if present) is deleted and the result of
        ``self.worker.get_token_worker()`` is returned instead.
        """
        token_path = 'authorization_code.json'

        # Nothing saved yet: go straight to the worker.
        if not os.path.exists(token_path):
            self.log.error_then("[-] Not find authorization_code.json")
            return await self.worker.get_token_worker()

        self.log.detail_info("[*] File: authorization_code.json is exist!")
        invalid_notice = (
            "[-] The authorization_code.json is not correct, deleting the file..."
        )
        try:
            with open(token_path, 'r') as handle:
                saved = json.load(handle)
            if saved['authorization'] == "":
                self.log.error_then(invalid_notice)
            else:
                self.log.success_info("[+] Success get authorization!")
                return saved['authorization']
        except Exception:
            # Unreadable file, malformed JSON or a missing key all count as
            # an unusable token file.
            self.log.error_then(invalid_notice)

        # Reaching this point means the saved file was unusable: drop it so
        # a new one can be written, then fetch a new token.
        try:
            os.remove(token_path)
            self.log.success_info(
                "[+] Delete authorization_code.json successful!")
        except Exception as removal_error:
            self.log.error_info("[-] " + str(removal_error))
        return await self.worker.get_token_worker()
Example #8
0
class Worker(object):
    """Run the token, upload and crawl steps, reporting through ``self.log``."""

    def __init__(self):
        """Attach a LogHandler as ``self.log``."""
        self.log = LogHandler()

    async def get_token_worker(self):
        """Obtain an authorization string and save it to disk.

        Builds an ``Authorization`` from ``DROPBOX_AUTHORIZATION_CODE``,
        parses the JSON text returned by its ``get_token()``, derives the
        ``authorization`` entry from ``token_type`` and ``access_token`` and
        writes the whole mapping to ``authorization_code.json``.

        Returns:
            The ``authorization`` string, or ``None`` when the code is not
            configured or the token response lacks the expected keys.
        """
        if DROPBOX_AUTHORIZATION_CODE == "":
            self.log.error_info(
                "[-] You don't set DROPBOX_ACCESS_TOKEN in config.py,\n"
                "Please login https://www.DropBox.com/oauth2/authorize?"
                f"client_id={client_id}&response_type=code "
                "to get an ACCESS_TOKEN")
            # error_info may or may not terminate the program; either way
            # there is nothing useful to request with an empty code.
            return None

        self.log.detail_info(
            "[*] Trying to get authorization-token information")

        authorization = Authorization(
            authorization_code=DROPBOX_AUTHORIZATION_CODE)
        try:
            token_json = json.loads(await authorization.get_token())
            token_json.setdefault(
                'authorization',
                token_json['token_type'].capitalize() + " " +
                token_json['access_token'])
        except KeyError:
            self.log.error_info(
                "[-] Your DROPBOX_ACCESS_TOKEN in config.py has been used! "
                "Try another DROPBOX_ACCESS_TOKEN from https://www.DropBox.com/oauth2/authorize?"
                f"client_id={client_id}&response_type=code")
            return None

        # 'w' rather than 'x': a stale file left behind (for example when an
        # earlier delete failed) must not turn a successful token fetch into
        # a FileExistsError.
        with open('authorization_code.json', 'w') as f:
            json.dump(token_json, f)
        self.log.success_info("[+] File: authorization_code.json created.")
        return token_json['authorization']

    async def upload_worker(self, authorization_token, file_buffer, tag):
        """Upload *file_buffer* under *tag* using *authorization_token*.

        Args:
            authorization_token: Value handed to ``Upload``.
            file_buffer: Content to upload; must be non-empty.
            tag: Non-empty tag forwarded to ``Upload.upload``.

        Returns:
            ``None``. Nothing is uploaded when *file_buffer* or *tag* is
            empty; the problem is reported through ``self.log.error_info``.
        """
        # Truthiness instead of `== ""`: an empty bytes buffer (b"") does not
        # compare equal to "", so the old check let empty buffers through.
        if not file_buffer:
            self.log.error_info(
                "[-] You don't have any file buffer to upload!")
            return None

        if tag == "":
            self.log.error_info(
                "[-] You don't set an certain tag which confirm a path on dropbox!"
            )
            return None

        self.log.detail_info("[*] Start upload files")

        upload = Upload(authorization_token=authorization_token)
        upload_json = json.loads(await upload.upload(file_buffer=file_buffer,
                                                     tag=tag))
        self.log.success_info(
            f"[+] File: {upload_json['name']} upload successful.")
        return None

    async def spider_worker(self, url):
        """Crawl *url* with a new ``Spider`` and return its result.

        Returns:
            The ``(file_name, pdf_buffer)`` pair unpacked from
            ``Spider.spider(url)``.
        """
        if url == "":
            self.log.error_info("[-] You don't set an url to spider!")

        spider = Spider()
        file_name, pdf_buffer = await spider.spider(url)
        self.log.success_info(f"[+] {url} crawl successful.")
        return file_name, pdf_buffer