def download(url):
    """Capture every page of a doc88 document as PNG and hand them to conpdf.

    Opens ``url`` in Chromium Edge, expands the document, zooms in, then
    scrolls to each page canvas, waits for it to finish loading and stores
    its pixels under ``./temp/<title>/<index>.png`` (0-based index).
    Finally ``conpdf`` is called with ``output/<title>.pdf`` as target
    (presumably it merges the PNGs into that PDF -- see its definition).

    Args:
        url: Address of the document page to capture.

    Raises:
        Any Selenium or I/O error raised while scraping is propagated; the
        browser is shut down first in every case.
    """
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('log-level=3')
    driver = Edge(options=options)

    title = "output"
    try:
        try:
            driver.set_page_load_timeout(15)
            driver.get(url)
            title = driver.title
        except Exception:
            # Best effort: a slow page is usually still usable. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # through.
            print("Timeout - start download anyway.")
        print(f'道客巴巴: 《{title}》')
        time.sleep(5)

        try:
            # Expand the whole document ("continue reading" button).
            elem_cont_button = driver.find_element_by_id("continueButton")
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", elem_cont_button)
            ActionChains(driver).move_to_element(elem_cont_button).perform()
            time.sleep(0.5)
            elem_cont_button.click()
        except NoSuchElementException:
            # No button on the page: nothing to expand.
            pass

        # Read the page count; the last space-separated token of the label
        # is parsed as the number of pages.
        pages_label = (
            driver.find_element_by_id('readshop')
            .find_element_by_class_name('mainpart')
            .find_element_by_class_name('shop3')
            .find_element_by_class_name('text')
            .get_attribute('innerHTML')
        )
        num_of_pages = int(pages_label.split(' ')[-1])

        # Zoom in five times before capturing the canvases.
        for _ in range(5):
            driver.find_element_by_id('zoomInButton').click()
            time.sleep(0.5)

        # Start from an empty per-document temp directory.
        page_dir = f'./temp/{title}'
        if os.path.exists(page_dir):
            shutil.rmtree(page_dir)
        os.makedirs(page_dir)

        for page_index in trange(num_of_pages):
            time.sleep(0.5)
            canvas_id = "page_" + str(page_index + 1)
            pagepb_id = "pagepb_" + str(page_index + 1)

            # Scroll the canvas into view so the site renders it.
            element = driver.find_element_by_id(canvas_id)
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", element)
            ActionChains(driver).move_to_element(element).perform()
            time.sleep(0.5)

            # The progress element is emptied once the page has loaded.
            while driver.find_element_by_id(
                    pagepb_id).get_attribute('innerHTML'):
                time.sleep(1)

            js_cmd = "var canvas = document.getElementById('{}');".format(
                canvas_id) + "return canvas.toDataURL();"
            data_url = driver.execute_script(js_cmd)
            # Drop the "data:image/png;base64," header by splitting on the
            # first comma instead of relying on its exact length.
            encoded = data_url.split(",", 1)[1].encode()
            with open(f"./temp/{title}/{page_index}.png", "wb") as fh:
                fh.write(base64.decodebytes(encoded))
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    print('下载完毕,正在转码')
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')
class AbstractDriver():
    """Locate, download, start and monitor a local Selenium driver.

    Supports the browsers identified by the user-agent tokens ``"Edg"``
    (msedgedriver) and ``"Chrome"`` (chromedriver). Progress and status
    messages are pushed through message announcers as server-sent events.
    """

    driver = None
    browser = None
    driverPath = None
    # Resolved once, when the class body is executed.
    driverFolder = Path.cwd() / "driver"
    driverInstalledBool = False
    headless = False
    userAgent = None
    announcer = None
    pathStatusStream = None
    driverStatusStream = None

    def __init__(self, announcer):
        """Create the two status streams and remember ``announcer``.

        Args:
            announcer: Object with ``announce`` and ``format_sse`` methods
                used for download progress messages.
        """
        self.pathStatusStream = messageAnnouncer.MessageAnnouncer()
        self.driverStatusStream = messageAnnouncer.MessageAnnouncer()
        self.announcer = announcer

    def getDriverPathStatus(self):
        """Publish the driver installation status from a background thread."""
        x = threading.Thread(
            target=self._getDriverPathStatus
        )
        x.start()

    def _getDriverPathStatus(self):
        """Announce on ``pathStatusStream`` whether a driver is installed."""
        respDict = {
            "status": 0,
            "eventSourceUrl": "/admin/stream/getDriverPathStatus",
            "title": "Driver name",
            "headerBadge": {
                "caption": "",
                "content": "",
            },
            "action": [
                {
                    "name": "Download driver",
                    "actionUrl": "http://localhost:5000/admin/driver/"
                                 "Edg?headless=false",
                    "enabled": True
                }
            ],
            "bodyBadge": {
                "caption": "",
                "content": "",
            },
        }
        errorIcon = '<i class="material-icons">error</i>'

        # A known path counts as installed; otherwise scan the folder.
        installed = True
        if self.driverPath is None:
            installed, _ = self.getDriverPath(self.driverFolder, None)

        if installed is False:
            respDict["status"] = 1
            respDict["headerBadge"]["caption"] = "Driver not installed"
            respDict["headerBadge"]["content"] = errorIcon
            respDict["bodyBadge"]["caption"] = "Driver not installed"
            respDict["bodyBadge"]["content"] = errorIcon
        else:
            respDict["status"] = 0
            respDict["headerBadge"]["caption"] = "driver"
            respDict["headerBadge"]["content"] = str(self.driverPath.stem)
            respDict["action"][0]["enabled"] = False

        msgText = json.dumps(
            respDict,
            default=str
        )
        self.pathStatusStream.announce(
            self.pathStatusStream.format_sse(msgText)
        )

    def getDriverStatus(self):
        """Publish the running-driver status from a background thread."""
        x = threading.Thread(
            target=self._getDriverStatus
        )
        x.start()

    def _getDriverStatus(self):
        """Announce on ``driverStatusStream`` whether the driver responds."""
        respDict = {
            "status": 0,
            "eventSourceUrl": "/admin/stream/getDriverStatus",
            "title": "Driver status",
            "headerBadge": {
                "caption": "",
                "content": "",
            },
            "action": [
                {
                    "name": "Start driver",
                    "actionUrl": "http://localhost:5000/admin/startDriver",
                    "enabled": True
                }
            ],
            "bodyBadge": {
                "caption": "",
                "content": "",
            },
        }
        errorIcon = '<i class="material-icons">error</i>'

        try:
            # AttributeError: driver is still None.
            # IndexError: the session reports no window at all.
            firstHandle = self.driver.window_handles[0]
        except (WebDriverException, AttributeError, IndexError) as e:
            respDict["status"] = 1
            respDict["headerBadge"]["caption"] = "Not started"
            respDict["headerBadge"]["content"] = errorIcon
            respDict["bodyBadge"]["caption"] = str(e)
            respDict["bodyBadge"]["content"] = errorIcon
        else:
            respDict["status"] = 0
            respDict["headerBadge"]["caption"] = "instance"
            respDict["headerBadge"]["content"] = str(firstHandle)
            respDict["action"][0]["enabled"] = False

        # Built after the try block so it is always bound when announced.
        msgText = json.dumps(
            respDict,
            default=str
        )
        self.driverStatusStream.announce(
            self.driverStatusStream.format_sse(msgText)
        )

    def checkDriver(self):
        """Start the driver if it is missing, or restart it if unreachable."""
        if self.driver is None:
            print("Driver not started")
            print("Starting programatically")
            print("Assuming you installed only required drivers")
            _, self.driverPath = self.getDriverPath(self.driverFolder, None)
            if self.driverPath is None:
                # Nothing to start from; avoid dereferencing None below.
                print("Driver not installed")
                return
            if self.driverPath.name == "msedgedriver.exe":
                self.browser = "Edg"
            elif self.driverPath.name == "chromedriver.exe":
                self.browser = "Chrome"
            else:
                print("Browser not supported yet")
            # make browser headless or not
            self.createDriver(self.browser, self.driverPath, False)
        else:
            try:
                self.driver.execute(Command.STATUS)
            except MaxRetryError:
                # The driver process no longer answers: start a new one.
                self.createDriver(self.browser, self.driverPath, False)

    def getDriver(self):
        """Return the driver, starting or restarting it when needed."""
        self.checkDriver()
        return self.driver

    def getDriverPath(self, driverFolder, browser=None):
        """
        Check if driver is installed and returns path

        The result always reflects this lookup only: when nothing matches,
        ``(False, None)`` is returned and stored, instead of whatever an
        earlier lookup left on the instance.

        Args:
            driverFolder (Path): Pathlib path to driver folder
            browser (string, optional): Browser type. Defaults to None,
                which accepts any ``.exe`` found.

        Returns:
            driverInstalledBool (bool): True if driver was found
            driverPath (Path): Driver + driver name path, None if not found
        """
        wanted = None if browser is None else browser.lower()
        found = False
        foundPath = None
        for candidate in driverFolder.glob('**/*.exe'):
            if wanted is None or wanted in candidate.name.lower():
                # As before, the last match in glob order wins.
                found = True
                foundPath = candidate

        self.driverInstalledBool = found
        self.driverPath = foundPath
        return self.driverInstalledBool, self.driverPath

    def downloadDriver(self, browser, headlessStr, userAgent):
        """Store the request parameters and run the download in a thread.

        Args:
            browser (str): Browser token, ``"Edg"`` or ``"Chrome"``.
            headlessStr (str): ``"true"`` (any case) enables headless mode.
            userAgent (str): User-agent string used to read the version.
        """
        self.browser = browser
        self.headless = headlessStr.lower() == "true"
        self.userAgent = userAgent

        x = threading.Thread(
            target=self._downloadDriver
        )
        x.start()

    def _downloadDriver(self):
        """
        Creates selenium driver for webscrapping automation
        Downloads it into driver folder if not installed
        """
        # Create the configured folder itself, not "driver" relative to
        # whatever the current directory happens to be now.
        self.driverFolder.mkdir(parents=True, exist_ok=True)

        userAgent = self.userAgent or ""
        msgTxt = "User agent: " + userAgent + "<br>"
        self.announcer.announce(self.announcer.format_sse(msgTxt))

        # User-agent tokens look like "Chrome/90.0.4430.93".
        version = ""
        for token in userAgent.split(" "):
            parts = token.split("/")
            if parts[0] == self.browser and len(parts) > 1:
                version = parts[1]

        if len(version) == 0:
            msgTxt = "Error: Browser not found, options are - Chrome, Edg <br>"
            self.announcer.announce(self.announcer.format_sse(msgTxt))

        # get driver path
        self.driverInstalledBool, self.driverPath = self.getDriverPath(
            self.driverFolder, self.browser
        )

        # download driver
        if not self.driverInstalledBool:
            if len(version) == 0:
                # Without a version no download URL can be built; the
                # error has already been announced above.
                return

            msgTxt = "Installing driver <br>"
            self.announcer.announce(self.announcer.format_sse(msgTxt))

            if self.browser == "Chrome":
                browserDriverDownloadPage, _, _ = download.getRequest(
                    "https://chromedriver.chromium.org/downloads"
                )
                pattern = r"ChromeDriver (" \
                    + version.split(".")[0] \
                    + r"\.\d*\.\d*\.\d*)"
                matches = re.findall(
                    pattern, browserDriverDownloadPage.content.decode("utf-8")
                )
                if not matches:
                    msgTxt = "Error: No matching ChromeDriver release found <br>"
                    self.announcer.announce(self.announcer.format_sse(msgTxt))
                    return
                existingDriverVersion = matches[0]
                browserDriverDownloadUrl = \
                    "https://chromedriver.storage.googleapis.com/" \
                    + existingDriverVersion \
                    + "/chromedriver_win32.zip"
            elif self.browser == "Edg":
                browserDriverDownloadUrl = \
                    "https://msedgedriver.azureedge.net/" \
                    + version \
                    + "/edgedriver_win64.zip"
            else:
                print("Browser not supported yet")
                return

            msgTxt = "Driver URL: " + browserDriverDownloadUrl + "<br>"
            self.announcer.announce(self.announcer.format_sse(msgTxt))

            driverRequest = download.getRequest(browserDriverDownloadUrl)[0]
            driverZip = zipfile.ZipFile(io.BytesIO(driverRequest.content))
            driverZip.extractall(self.driverFolder)
            msgTxt = "Downloaded and extracted driver <br>"
            self.announcer.announce(self.announcer.format_sse(msgTxt))

            # get driver path
            self.driverInstalledBool, self.driverPath = self.getDriverPath(
                self.driverFolder, self.browser
            )
            if not self.driverInstalledBool:
                msgTxt = "Error: Driver not found after extraction <br>"
                self.announcer.announce(self.announcer.format_sse(msgTxt))
                return
        else:
            msgTxt = "Driver already satisfied <br>"
            self.announcer.announce(self.announcer.format_sse(msgTxt))

        # Create driver
        self.driver = self.createDriver(
            self.browser, self.driverPath, self.headless
        )
        msgTxt = "Started Driver <br>"
        self.announcer.announce(self.announcer.format_sse(msgTxt))

    def createDriver(self, browser, driverPath, headless=None):
        """
        Start selenium web driver

        Args:
            browser (str): Browser type
            driverPath (Path): Path to driver
            headless (bool): Headless bool

        Returns:
            driver: selenium driver. For an unsupported browser nothing is
                started and the current ``self.driver`` (possibly None) is
                returned unchanged.
        """
        self.headless = headless

        if browser == "Edg":
            edge_options = EdgeOptions()
            if self.headless:
                # make Edge headless
                # NOTE(review): use_chromium is only enabled here; confirm
                # whether non-headless Edge should set it as well.
                edge_options.use_chromium = True
                edge_options.add_argument("headless")
                edge_options.add_argument("disable-gpu")
                edge_options.add_argument("--log-level=3")
                edge_options.add_experimental_option(
                    'excludeSwitches',
                    ['enable-logging']
                )
            self.driver = Edge(
                executable_path=str(driverPath),
                options=edge_options
            )
        elif browser == "Chrome":
            chrome_options = Options()
            if self.headless:
                chrome_options.add_argument("--headless")
                chrome_options.add_argument("--log-level=3")
                chrome_options.add_experimental_option(
                    'excludeSwitches',
                    ['enable-logging']
                )
            self.driver = webdriver.Chrome(
                executable_path=str(driverPath),
                options=chrome_options
            )
        else:
            print("Browser not supported yet")
            # No driver was created: do not configure None or a stale one.
            return self.driver

        self.driver.set_window_size(1800, 1080)
        self.driver.set_page_load_timeout(100000)

        return self.driver