def __init__(self):
    """Create the logger and the default option dictionaries.

    Sets four attributes on the instance:

    * ``log`` -- a new ``LogHandler``.
    * ``set_view_port_option`` -- ``width``/``height`` of 1920 x 1080.
    * ``goto_option`` -- ``waitUntil`` set to ``'networkidle2'``.
    * ``pdf_option`` -- the same ``width``/``height`` plus ``format`` ``'A4'``.
    """
    self.log = LogHandler()

    # The viewport and PDF options share the same dimensions.
    page_width, page_height = 1920, 1080

    self.set_view_port_option = dict(width=page_width, height=page_height)
    self.goto_option = dict(waitUntil='networkidle2')
    self.pdf_option = dict(
        width=page_width,
        height=page_height,
        format='A4',
    )
class Spider(object):
    """Load a URL in a browser page and capture it as a PDF.

    The browser is obtained from ``launch()`` (presumably pyppeteer's, going
    by the error named in the comment inside ``spider`` -- confirm against
    the file's imports).
    """

    def __init__(self):
        """Create the logger and the default viewport/navigation/PDF options."""
        self.log = LogHandler()
        self.set_view_port_option = {'width': 1920, 'height': 1080}
        self.goto_option = {
            'waitUntil': 'networkidle2',
        }
        self.pdf_option = {
            'width': 1920,
            'height': 1080,
            'format': 'A1',
        }

    async def spider(self, url):
        """Crawl *url* and return ``(filename, pdf)``.

        ``filename`` is the page title passed through ``translate_word`` and
        ``pdf`` is whatever ``page.pdf`` returns.

        Any exception raised while launching the browser or working with the
        page is logged and swallowed; in that case the coroutine returns
        ``None`` instead of a tuple, so callers that unpack the result must
        check for it.  An exception raised while closing the browser is not
        swallowed.
        """
        self.log.detail_info(f"Start crawl {url}")
        self.log.detail_info(f"{url} started at {time.strftime('%X')}")
        # Handle Error: pyppeteer.errors.NetworkError: Protocol error Runtime.callFunctionOn: Target closed.
        # Issue fix #1 by C1tas
        #
        # ``browser`` is bound before the ``try`` so the ``finally`` clause
        # can tell whether ``launch()`` ever succeeded.  Without this, a
        # failing ``launch()`` made the cleanup raise ``UnboundLocalError``
        # and hid the error that was just logged.
        browser = None
        try:
            browser = await launch()
            page = await browser.newPage()
            await page.setViewport(self.set_view_port_option)
            await page.goto(url, self.goto_option)
            await scroll_page(page)
            pdf = await page.pdf(self.pdf_option)
            title = await page.title()
            filename = await self.translate_word(title)
        except Exception as exc:
            self.log.error_then(
                f"Occur an unexpected error when crawl {url}: {str(exc)}!")
            self.log.detail_info(f"{url} finished at {time.strftime('%X')}")
            return None
        finally:
            # Single point of cleanup: runs on success and on failure, and
            # only when there is a browser to close.
            if browser is not None:
                await browser.close()
        self.log.detail_info(f"{url} finished at {time.strftime('%X')}")
        return filename, pdf

    @staticmethod
    async def translate_word(word):
        """Return *word* with selected characters replaced one-for-one.

        Each character of the first string below is mapped to the character
        at the same position in the second string; all other characters are
        left untouched.
        """
        table = {
            ord(f): ord(t)
            for f, t in zip(u',。!?【】()/%#@&1234567890',
                            u',.!?[]()-%#@&1234567890')
        }
        return word.translate(table)
class Spider(object):
    """Load a URL in a browser page and capture it as a PDF."""

    def __init__(self):
        """Create the logger and the default viewport/navigation/PDF options."""
        self.log = LogHandler()
        self.set_view_port_option = {'width': 1920, 'height': 1080}
        self.goto_option = {
            'waitUntil': 'networkidle2',
        }
        self.pdf_option = {
            'width': 1920,
            'height': 1080,
            'format': 'A4',
        }

    async def spider(self, url):
        """Crawl *url* and return ``(filename, pdf)``.

        ``filename`` is the page title passed through ``translate_word`` and
        ``pdf`` is whatever ``page.pdf`` returns.  Exceptions raised while
        working with the page propagate to the caller, but the browser is
        closed first.
        """
        self.log.detail_info(f"[*] Start crawl {url}")
        self.log.detail_info(f"[*] {url} started at {time.strftime('%X')}")
        browser = await launch()
        try:
            page = await browser.newPage()
            await page.setViewport(self.set_view_port_option)
            await page.goto(url, self.goto_option)
            title = await page.title()
            filename = await self.translate_word(title)
            await page.evaluate(scroll_page_js)
            pdf = await page.pdf(self.pdf_option)
        finally:
            # Close the browser even when navigation or rendering fails, so
            # a failed crawl does not leave the browser open.
            await browser.close()
        self.log.detail_info(f"[*] {url} finished at {time.strftime('%X')}")
        return filename, pdf

    async def translate_word(self, word):
        """Return *word* with selected characters replaced one-for-one.

        Each character of the first string below is mapped to the character
        at the same position in the second string; all other characters are
        left untouched.  No instance state is read.
        """
        table = {
            ord(f): ord(t)
            for f, t in zip(u',。!?【】()/%#@&1234567890',
                            u',.!?[]()-%#@&1234567890')
        }
        return word.translate(table)
def __init__(self):
    """Build the three collaborators kept on the instance.

    They are created in this order: a ``LogHandler`` stored as ``log``, a
    ``Worker`` stored as ``worker`` and a ``Check`` stored as ``check``.
    """
    # Logger first, then the worker, then the checker.
    self.log = LogHandler()
    self.worker = Worker()
    self.check = Check()
def __init__(self):
    """Attach a new ``LogHandler`` to the instance as ``log``."""
    self.log = LogHandler()
def __init__(self):
    """Build the two collaborators kept on the instance.

    A ``LogHandler`` is stored as ``log`` and a ``CommonWorker`` is stored
    as ``worker``, created in that order.
    """
    # Logger first, then the worker.
    self.log = LogHandler()
    self.worker = CommonWorker()
class Check(object):
    """Look up a cached authorization value, falling back to the worker."""

    def __init__(self):
        """Create the logger and the worker used for the fallback path."""
        self.log = LogHandler()
        self.worker = Worker()

    async def check_authorization_code(self):
        """Return the authorization value to use.

        If ``authorization_code.json`` exists in the current directory and
        holds a non-empty string under the ``authorization`` key, that string
        is returned.  If the file exists but is unreadable, is not valid
        JSON, lacks the key, or holds an empty or non-string value, the file
        is deleted.  In every case other than the first, the result of
        ``self.worker.get_token_worker()`` is returned.
        """
        if os.path.exists('authorization_code.json'):
            self.log.detail_info("[*] File: authorization_code.json is exist!")
            try:
                with open('authorization_code.json', 'r') as f:
                    token = json.load(f)
                authorization = token['authorization']
            except Exception:
                # Deliberately broad: whatever went wrong while reading or
                # indexing, the cached file is unusable and gets replaced.
                authorization = None

            # Only a non-empty string counts as valid.  A null or numeric
            # value used to pass the bare ``!= ""`` comparison and was
            # returned as if it were a real token.
            if isinstance(authorization, str) and authorization != "":
                self.log.success_info("[+] Success get authorization!")
                return authorization

            self.log.error_then(
                "[-] The authorization_code.json is not correct, deleting the file..."
            )
            try:
                os.remove('authorization_code.json')
                self.log.success_info(
                    "[+] Delete authorization_code.json successful!")
            except Exception as exc:
                self.log.error_info("[-] " + str(exc))
        else:
            self.log.error_then("[-] Not find authorization_code.json")
        return await self.worker.get_token_worker()
class Worker(object):
    """Coroutines that obtain a token, upload a file buffer and crawl a URL."""

    def __init__(self):
        """Attach a new ``LogHandler`` to the instance as ``log``."""
        self.log = LogHandler()

    async def get_token_worker(self):
        """Exchange ``DROPBOX_AUTHORIZATION_CODE`` for an authorization value.

        The JSON text returned by ``Authorization.get_token`` is parsed, an
        ``authorization`` entry (capitalised ``token_type``, a space, then
        ``access_token``) is added if not already present, and the whole
        mapping is written to a newly created ``authorization_code.json``
        (mode ``'x'``, so an existing file raises).  Returns the
        ``authorization`` entry.  When a ``KeyError`` occurs the problem is
        logged and ``None`` is returned implicitly.
        """
        code_is_blank = DROPBOX_AUTHORIZATION_CODE == ""
        if code_is_blank:
            # Only reported here; execution carries on afterwards unless the
            # logger call itself stops it.
            self.log.error_info(
                f"[-] You don\'t set DROPBOX_ACCESS_TOKEN in config.py,\n"
                f"Please login https://www.DropBox.com/oauth2/authorize?"
                f"client_id={client_id}&response_type=code "
                f"to get an ACCESS_TOKEN")

        self.log.detail_info(
            "[*] Trying to get authorization-token information")
        authorization = Authorization(
            authorization_code=DROPBOX_AUTHORIZATION_CODE)

        try:
            raw_response = await authorization.get_token()
            token_json = json.loads(raw_response)

            # Built eagerly, so a missing 'token_type' or 'access_token'
            # raises KeyError even when 'authorization' is already present.
            header_value = (token_json['token_type'].capitalize() + " " +
                            token_json['access_token'])
            token_json.setdefault('authorization', header_value)

            with open('authorization_code.json', 'x') as token_file:
                json.dump(token_json, token_file)
            self.log.success_info("[+] File: authorization_code.json created.")
            return token_json['authorization']
        except KeyError:
            self.log.error_info(
                f"[-] Your DROPBOX_ACCESS_TOKEN in config.py has been used! "
                f"Try another DROPBOX_ACCESS_TOKEN from https://www.DropBox.com/oauth2/authorize?"
                f"client_id={client_id}&response_type=code")

    async def upload_worker(self, authorization_token, file_buffer, tag):
        """Upload *file_buffer* under *tag* and log the uploaded file's name.

        An empty-string *file_buffer* or *tag* is reported through the
        logger; the upload is attempted regardless.  The response text from
        ``Upload.upload`` is parsed as JSON and its ``name`` entry is logged.
        Returns ``None``.
        """
        if file_buffer == "":
            self.log.error_info(
                "[-] You don\'t have any file buffer to upload!")
        if tag == "":
            self.log.error_info(
                "[-] You don\'t set an certain tag which confirm a path on dropbox!"
            )

        self.log.detail_info("[*] Start upload files")
        uploader = Upload(authorization_token=authorization_token)
        response_text = await uploader.upload(file_buffer=file_buffer,
                                              tag=tag)
        upload_json = json.loads(response_text)
        self.log.success_info(
            f"[+] File: {upload_json['name']} upload successful.")

    async def spider_worker(self, url):
        """Crawl *url* with a new ``Spider`` and return ``(file_name, pdf_buffer)``.

        An empty-string *url* is reported through the logger; the crawl is
        attempted regardless.
        """
        if url == "":
            self.log.error_info("[-] You don\'t set an url to spider!")

        crawler = Spider()
        crawl_result = await crawler.spider(url)
        file_name, pdf_buffer = crawl_result
        self.log.success_info(f"[+] {url} crawl successful.")
        return file_name, pdf_buffer