def read_cookies():
    # c_service = Service('E:\work\\twbot\chromedriver.exe')
    c_service = Service('/opt/google/chrome/chromedriver')
    c_service.command_line_args()
    c_service.start()
    # Chrome
    option = webdriver.ChromeOptions()
    # option.set_headless()
    option.add_argument("--headless")
    option.add_argument("--no-sandbox")
    driver = webdriver.Chrome(chrome_options=option)
    # driver = webdriver.Chrome()
    # Firefox alternative:
    # option = webdriver.FirefoxOptions()
    # option.add_argument("headless")
    # option.add_argument('--no-sandbox')
    # option.set_headless()
    # driver = webdriver.Firefox(firefox_options=option)
    driver.get("https://mobile.twitter.com")
    with open("qrsncookies.txt", "r") as fp:
        cookies = json.load(fp)
    for cookie in cookies:
        if 'expiry' in cookie:
            del cookie['expiry']
        driver.add_cookie(cookie)
    driver.get("https://mobile.twitter.com/home")
    print(driver.title)
    time.sleep(20)
    return driver, c_service
def main():
    service = Service('/Users/catemiller/vax-finder/chromedriver')
    service.start()
    driver = webdriver.Remote(service.service_url)

    # CVS
    print("Searching CVS")
    driver.get('https://www.cvs.com/immunizations/'
               'covid-19-vaccine?icid=coronavirus-lp-nav-vaccine')
    driver.find_element_by_link_text("California").click()
    cities = driver.find_elements_by_xpath('.//span[@class = "city"]')
    status = driver.find_elements_by_xpath('.//span[@class = "status"]')
    print(" Appointments are available in the following cities:")
    for ii in range(0, len(status)):
        if status[ii].text == "Available":
            print("   " + cities[ii].text)
    print(" To book an appointment: ")
    print('  https://www.cvs.com/vaccine/'
          'intake/store/covid-screener/covid-qns')
    driver.quit()
def initialize_web_driver():
    DRIVER_PATH = r"C:\Users\jerem\Desktop\chromedriver.exe"
    service = Service(DRIVER_PATH)
    service.start()
    wd = webdriver.Remote(service.service_url)
    wd.quit()
    return DRIVER_PATH
def get_movie_url(url):
    data_dict = {}
    i = 0
    html = ask_url(url)
    doc = etree.HTML(html)
    # The <a> tags and text for every episode
    all_url = doc.xpath('//div[@class="fed-play-item fed-drop-item fed-visible"]//ul[@class="fed-part-rows"]/li/a/@href')
    all_title = doc.xpath('//div[@class="fed-play-item fed-drop-item fed-visible"]'
                          '//ul[@class="fed-part-rows"]/li/a/text()')
    # Use Selenium to get the src of the iframe
    c_service = Service('/usr/bin/chromedriver')
    c_service.command_line_args()
    c_service.start()
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-dev-shm-usage')
    browser = webdriver.Chrome('/usr/bin/chromedriver', options=option)
    # print('Crawling video links...')
    for url in all_url:
        browser.get('https://kuyun.tv' + url)
        movie_url = browser.find_element_by_id('fed-play-iframe').get_attribute('src')
        data_dict[all_title[i]] = movie_url
        i = i + 1
    browser.quit()
    c_service.stop()
    return data_dict
class WebTestModule:

    def __init__(self, initPage: str = None):
        configLoader = ConfigLoader()
        self.url = configLoader.getUrl()
        self.driverPath = configLoader.getChromeDriverPath()
        self.initWebdriver(initPage)

    def initWebdriver(self, initPage: str):
        self.service = Service(self.driverPath)
        self.service.start()
        capabilities = {'acceptSslCerts': True}
        self.driver = webdriver.Remote(self.service.service_url, capabilities)
        self.driver.implicitly_wait(5)
        if initPage is None:
            self.driver.get(self.url)
        else:
            self.driver.get(initPage)
        # return driver

    def login(self, username: str, password: str):
        usernameTextField = self.driver.find_element_by_id('usernameText')
        usernameTextField.send_keys(username)
        passwordTextField = self.driver.find_element_by_id('passwordText')
        passwordTextField.send_keys(password)
        loginButton = self.driver.find_element_by_xpath("//input[@value='login']")
        loginButton.click()
        return HomePage(self)

    def logout(self):
        self.driver.find_element_by_id('logoutNav').click()

    def close(self):
        self.driver.quit()
def before_all(context):
    service = Service('drivers\\operadriver.exe')
    service.start()

    test_extension = "..\\restman-0.3.nex"
    b64ext = base64.b64encode(open(test_extension, 'rb').read())
    capabilities = {
        'operaOptions': {
            'binary': 'C:\\Program Files (x86)\\Opera Next\\30.0.1835.49\\opera.exe',
            'extensions': [b64ext],
        },
        'Proxy': {
            'proxyType': 'system'
        }
    }

    # Start web server (httpbin FTW!)
    context.server = subprocess.Popen([sys.executable, '-m', 'httpbin.core'])

    # Create browser
    context.browser = webdriver.Remote(service.service_url, capabilities)

    # Start extension
    context.browser.get('chrome-extension://fohkgjiaiapkkjjchddmhaaaghjakfeg/index.html')
    time.sleep(1)  # Wait for app to load
def punch(StudentId, Name):
    c_service = Service('/Users/wq//Downloads/chromedriver')
    c_service.command_line_args()
    c_service.start()
    driver = webdriver.Chrome(
        '/Users/wq/Downloads/chromedriver')  # launch the Chrome browser
    driver.get('http://xsc.sicau.edu.cn/SPCP')  # open the site
    # Locate elements via XPath
    result = driver.find_element_by_xpath('//*[@id="code-box"]')
    text = result.text
    driver.find_element_by_xpath('//*[@id="StudentId"]').click()
    driver.find_element_by_xpath('//*[@id="StudentId"]').send_keys(StudentId)
    driver.find_element_by_xpath('//*[@id="Name"]').click()
    driver.find_element_by_xpath('//*[@id="Name"]').send_keys(Name)
    driver.find_element_by_xpath('//*[@id="codeInput"]').click()
    driver.find_element_by_xpath('//*[@id="codeInput"]').send_keys(text)
    driver.find_element_by_xpath('//*[@id="Submit"]').click()
    driver.find_element_by_xpath('//*[@id="platfrom2"]').click()
    try:
        driver.find_element_by_xpath('//*[@id="ckCLS"]').click()
        driver.find_element_by_xpath('//*[@id="SaveBtnDiv"]/button').click()
    except:
        driver.find_element_by_xpath(
            '//*[@id="layui-layer1"]/div[3]/a').click()
    driver.quit()
    c_service.stop()
def dasked_get_images(url):
    service = Service(DRIVER_PATH)
    service.start()
    driver = webdriver.Remote(service.service_url)
    ssl._create_default_https_context = ssl._create_unverified_context
    title = url.split("/")[-1]
    driver.get(url)
    local_images = driver.find_elements_by_tag_name("img")
    img_set = set()
    for i in local_images:
        try:
            srcset = i.get_attribute("srcset")
            cdn_link = srcset.split("900w, ")[-1].replace("1080w", "").strip()
            if "/logo_" in cdn_link:
                continue
            if "-logo-" in cdn_link:
                continue
            if cdn_link:
                img_set.add(f"https:{cdn_link}")
        except Exception:
            pass
    img_set = list(img_set)
    ret = [(title + "_" + str(uuid.uuid4()).replace("-", ""), cdn_link)
           for cdn_link in img_set]
    driver.quit()
    return ret
def get_product_links():
    service = Service(DRIVER_PATH)
    service.start()
    driver = webdriver.Remote(service.service_url)
    ssl._create_default_https_context = ssl._create_unverified_context
    base_url = "https://worldwidecorals.com"
    output_path = f'./wwc/'

    # get all product links from all pages
    links = []
    for i in range(1, 8):
        driver.get(f'{base_url}/collections/all?page={i}')
        __scroll_down_page(driver)
        os.makedirs(output_path, exist_ok=True)
        # get the image source
        links.extend([
            element.get_attribute("href")
            for element in driver.find_elements_by_class_name('grid-product__link')
        ])
    driver.quit()
    return links
def GetThread(self):
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service('./chromedriver')
    service.start()
    driver = webdriver.Remote(service.service_url,
                              desired_capabilities=chrome_options.to_capabilities())
    driver.implicitly_wait(10)
    driver.get("https://twitter.com/{user}/status/{tweet_id}".format(
        user=self.USER.screen_name, tweet_id=self.FirstTweetID))
    try:
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.title_contains(self.USER.name))
        links = driver.find_elements_by_xpath(
            '//a[contains(@href, "{}/status")]'.format(self.USER.screen_name))
        for link in links:
            self.Tweets.append(link.get_attribute('href').split('/status/')[-1])
    except:
        print("Loading took too much time!")
    driver.quit()
    self.Tweets.sort()
    return self.Tweets
def crawler(url):
    service = Service('driver/chromedriver')
    service.start()
    # pass url to check if it's correct
    url = correct_url(url)
    driver = webdriver.Remote(service.service_url)
    driver.get(url)

    # crawler starts here
    # s = driver.find_element_by_css_selector("p").text
    # print(s)
    all_hover_elements = driver.find_elements_by_class_name('container')
    # p_elements = driver.find_elements_by_class_name
    # print(p_elements)
    for hover_element in all_hover_elements:
        # hover_element = driver
        p_element = hover_element.find_element_by_css_selector('p').text
        # product_name = p_element.get_attribute("body")
        print(p_element)

    # sleep for 10 seconds before quitting
    time.sleep(10)
    driver.quit()
def parse(url):
    service = Service('/usr/local/bin/chromedriver')
    service.start()
    driver = webdriver.Remote(service.service_url)
    driver.get(url)
    source_code = driver.page_source
    driver.quit()
    return source_code
def init_driver():
    # Locate the driver on the system
    Path = r"C:\SeleniumDrivers\chromedriver.exe"
    service = Service(Path)
    service.start()
    driver = webdriver.Remote(service.service_url)
    driver.implicitly_wait(40)
    return driver
def parse(url):
    service = Service('/usr/local/bin/chromedriver')
    service.start()
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    time.sleep(120)
    source_code = driver.page_source
    driver.quit()
    return source_code
def _get_selenium_service(cls) -> Service:
    if not hasattr(cls, '__selenium_service'):
        path = ChromeDriverLoader.driver_path
        if not path:
            raise AttributeError('Got an empty driver path.')
        service = Service(path)
        service.start()
        setattr(cls, '__selenium_service', service)
    return getattr(cls, '__selenium_service')
def startWebDriverService():
    """
    Starts the web driver as a service; this improves performance and does not
    require starting the chrome driver manually.
    TODO: Make it actually work.
    """
    service = Service(D_PATH_CHROMEDRIVER)
    service.start()
    driver = webdriver.Remote(service.service_url)
    return driver
class DownloadDataSet:

    def construct(self):
        # Use the executable file on my desktop to start the chromedriver
        self.service = Service('/home/abilashbodapati/Desktop/chromedriver')
        # Start the chromedriver service
        self.service.start()
        # Target web-url initialized (Github login-page)
        self.driver = webdriver.Remote(self.service.service_url)

    def startChromeService(self, url):
        # Use the executable file on my desktop to start the chromedriver
        # service = Service('/home/abilashbodapati/Desktop/chromedriver')
        # Start the chromedriver service
        # self.service.start()
        # Target web-url initialized (Github login-page)
        # driver = webdriver.Remote(service.service_url)
        self.driver.get(url)
        # Maximize the window when the webpage is opened
        self.driver.maximize_window()

    def downloadData(self):
        # By now we are on the webpage for the dataset
        # Click on the Download button
        self.driver.find_element_by_xpath(
            "/html/body/main/div[1]/div/div[5]/div[2]/div[1]/div/div/div[2]/div[2]/div[1]/div[2]/a[1]/div/span"
        ).click()
        time.sleep(4)
        # Click on the Sign-in button
        self.driver.find_element_by_xpath(
            "/html/body/main/div[1]/div[1]/div/form/div[2]/div/div[1]/a/li"
        ).click()
        time.sleep(5)
        self.driver.find_element_by_xpath(
            "/html/body/div[1]/div[1]/div[2]/div/div[2]/div/div/div[2]/div/div[1]/div/form/span/section/div/div/div[1]/div/div[1]/div/div[1]/input"
        ).send_keys('*****@*****.**')
        time.sleep(20)
        self.driver.find_element_by_xpath(
            "/html/body/div[1]/div[1]/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div/div[1]/div/div/span/span"
        ).click()
        time.sleep(35)
        # Click on the Download button again after signing in
        self.driver.find_element_by_xpath(
            "/html/body/main/div[1]/div/div[5]/div[2]/div[1]/div/div/div[2]/div[2]/div[1]/div[2]/a[1]/div/span"
        ).click()
        time.sleep(1.75)

    def moveDataToFolder(self):
        # Unzip the archive from Downloads into the Datasets folder.
        os.system(
            'unzip -q /home/abilashbodapati/Downloads/*_bundle_archive.zip -d ./Datasets'
        )
def request_raw_html_from_wiki() -> str:
    service = Service('./chromedriver/chromedriver')
    service.start()
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Remote(service.service_url, options=options)
    driver.get(stardew_fish_url)
    page = driver.page_source
    driver.quit()
    return page
def init_against_external_service(cls):
    service = Service(CHROMEDRIVER_BINARY_PATH)
    service.start()
    service_url = service.service_url
    # # for a manually-run server
    # service_url = 'http://localhost:9515'
    driver = Remote(service_url, DesiredCapabilities.CHROME)
    return cls(driver)
def get_driver():
    chromedriver_path = webium.settings.chromedriverpath
    c_service = Service(chromedriver_path)
    c_service.command_line_args()
    c_service.start()

    chrome_options = Options()
    if hasattr(webium.settings, "chrome_nosandbox") and webium.settings.chrome_nosandbox:
        chrome_options.add_argument('--no-sandbox')
    if hasattr(webium.settings, "chrome_disable_shmusage") and webium.settings.chrome_disable_shmusage:
        chrome_options.add_argument('--disable-dev-shm-usage')
    if hasattr(webium.settings, "chrome_ignore_certificate_errors") and webium.settings.chrome_ignore_certificate_errors:
        chrome_options.add_argument('--ignore-certificate-errors')
    if hasattr(webium.settings, "chrome_disable_gpu") and webium.settings.chrome_disable_gpu:
        chrome_options.add_argument('--disable-gpu')
    if hasattr(webium.settings, "chrome_disable_plugins") and webium.settings.chrome_disable_plugins:
        chrome_options.add_argument('--disable-plugins')
    if hasattr(webium.settings, "chrome_handless") and webium.settings.chrome_handless:
        chrome_options.add_argument('--headless')

    service_args = []
    if hasattr(webium.settings, "service_load_images") and webium.settings.service_load_images:
        service_args.append('--load-images=yes')
    else:
        service_args.append('--load-images=no')
    if hasattr(webium.settings, "service_disk_cache") and webium.settings.service_disk_cache:
        service_args.append('--disk-cache=yes')
    else:
        service_args.append('--disk-cache=no')
    if hasattr(webium.settings, "service_ignore_ssl_errors") and webium.settings.service_ignore_ssl_errors:
        service_args.append('--ignore-ssl-errors=true')
    else:
        service_args.append('--ignore-ssl-errors=false')

    global _driver_instance
    if not _driver_instance:
        _driver_instance = webdriver.Chrome(chrome_options=chrome_options,
                                            service_args=service_args,
                                            executable_path=chromedriver_path)
        _driver_instance.implicitly_wait(webium.settings.implicit_timeout)
    return _driver_instance
def initializeDriver(browser_name='google', driver='/opt/google/chrome/chromedriver'):
    print('initializeDriver: create a ' + browser_name + ' driver')
    google_driver = driver
    if browser_name == 'google':
        c_service = Service(google_driver)
        c_service.command_line_args()
        c_service.start()
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        browser = webdriver.Chrome(executable_path=google_driver, chrome_options=options)
        return browser, c_service
def setUp(self):
    super(ChromiumWebDriverFixture, self).setUp()
    # Import late to avoid hard dependency.
    from selenium.webdriver.chrome.service import Service as ChromeService
    service = ChromeService("/usr/lib/chromium-browser/chromedriver", 4444)
    # Set the LD_LIBRARY_PATH so the chrome driver can find the required
    # libraries.
    self.useFixture(
        EnvironmentVariable("LD_LIBRARY_PATH", "/usr/lib/chromium-browser/libs"))
    service.start()
    # Stop service on cleanup.
    self.addCleanup(service.stop)
class ChromeDriverManager(DriverManager):
    __chservice = None

    def launch_browser(self):
        chrome_option = Options()
        chrome_option.add_argument("--disable-infobars")
        chrome_option.add_argument("--start-maximized")
        chrome_option.add_argument("--disable-popup-blocking")
        cur_dir_path = os.path.dirname(os.path.realpath(__file__))
        chromedriver = cur_dir_path.split(
            sep='\\base')[0] + QEEnvironment.get_environment_dict().get('BrowserPath')
        os.environ["webdriver.chrome.driver"] = chromedriver
        driver = webdriver.Chrome(chromedriver, options=chrome_option)
        driver.get('https://ui.cogmento.com/')

    def start_service(self):
        try:
            if self.__chservice is None:
                cur_dir_path = os.path.dirname(os.path.realpath(__file__))
                driver_path = cur_dir_path.split(
                    sep='\\base')[0] + QEEnvironment.get_environment_dict().get('BrowserPath')
                self.__chservice = Service(driver_path)
                self.__chservice.start()
                print('Service is started')
        except:
            traceback.print_exc()

    def stop_service(self):
        if self.__chservice is not None and self.__chservice.is_connectable():
            print('Stop service')
            self.__chservice.stop()

    def create_driver(self):
        chrome_option = webdriver.ChromeOptions()
        chrome_option.add_argument("--disable-infobars")
        chrome_option.add_argument("--start-maximized")
        chrome_option.add_argument("--disable-popup-blocking")
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities['browser'] = 'chrome'
        capabilities = chrome_option.to_capabilities()
        self.driver = webdriver.Remote(self.__chservice.service_url,
                                       desired_capabilities=capabilities)
        self.edriver = EventFiringWebDriver(self.driver, EventListener())
        self.edriver.implicitly_wait(
            QEEnvironment.get_environment_dict().get('ImplicitWait'))
        self.edriver.get(QEEnvironment.get_environment_dict().get('URL'))
def getcook():
    loginurl = 'http://113.57.169.227:8088/ccps/login.jsp'  # login page
    path = r'd:\chromedriver.exe'
    # Load the webdriver, used to read element attributes on the login page
    # driver = webdriver.Chrome(r'd:\chromedriver.exe')
    # option = webdriver.ChromeOptions()
    # option.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    # option.add_argument('--headless')  # headless mode
    # option.add_argument('--disable-gpu')  # without this option, element location sometimes fails
    # option.add_experimental_option('excludeSwitches', ['enable-logging'])
    c_service = Service(path)
    c_service.command_line_args()
    c_service.start()
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(executable_path=path, options=chrome_options)
    driver.get(loginurl)  # open the login page
    driver.find_element_by_id('wcode').clear()  # locate the username field and clear it first
    driver.find_element_by_id('wcode').send_keys(u'WHBK100')  # enter the username
    driver.find_element_by_id('password').clear()  # locate the password field and clear it
    driver.find_element_by_id('password').send_keys(u'')  # enter the password
    # captcha = driver.find_element_by_id('captcha_image')  # captcha element
    # submit = driver.find_element_by_css_selector('a[name="登录"]')  # submit button
    submit = driver.find_element_by_link_text("登录")
    # Check whether a captcha is required
    captcha = []
    if captcha:
        captcha_field = driver.find_element_by_id('captcha_field')  # captcha input field
        text = input("Please enter the captcha: ")  # type the captcha in the console
        captcha_field.send_keys(text)  # pass the typed captcha to the browser opened by selenium
        submit.click()  # submit and log in
    else:
        submit.click()  # no captcha, submit the login directly
    cookies = driver.get_cookies()  # grab the cookies
    # driver.get('http://113.57.169.227:8088/ccps//workorder/findWorkOrderList.action?workOrder.range=yff&workOrder.standby3=order_deal')  # request another page
    time.sleep(1)
    driver.quit()
    c_service.stop()
    # print(cookies)
    # Return the cookies so other methods can reuse them instead of logging in on every request.
    return cookies
def get_urls(xingqi, A):
    # The browser runs headless in the background and is hard to close cleanly,
    # so service.start() / service.stop() are used to control the process.
    c_service = Service(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    # Use Chrome's built-in headless mode so it runs silently
    opt = Options()
    opt.add_argument('--headless')
    browser = webdriver.Chrome(chrome_options=opt)
    wait = WebDriverWait(browser, 10)  # wait up to 10 seconds for the page to load
    browser.get('http://www.qqshidao.com/index.php?c=home&a=bifen')
    time.sleep(3)
    submit = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="app"]/div[6]/div/span[6]')))
    submit.click()
    time.sleep(2)
    # Set a large scroll distance so the page jumps straight to the bottom
    js = 'var q=document.documentElement.scrollTop=100000'
    browser.execute_script(js)
    # When there are many matches, new data loads after each scroll, so scroll once and pause
    time.sleep(1)
    browser.execute_script(js)  # scroll again to load all matches
    time.sleep(2)
    yuanma = browser.page_source
    browser.quit()
    c_service.stop()
    s = etree.HTML(yuanma)
    urls = []
    urls_ = s.xpath('//*[@id="app"]/div[7]/div/table/tbody/tr[@data-fid]')
    for each in urls_:
        fid = each.attrib['data-fid']
        xingqiji = each.xpath('./td[3]/text()')[0]
        if xingqi in xingqiji:
            url = 'http://www.qqshidao.com/index.php?c=odds&a=betfair&fid={}'.format(fid)
            A['{}'.format(url)] = xingqiji
            urls.append(url)
    return urls
def main():
    # seed the random number generator
    random.seed()

    # read a random pun from the file to share
    try:
        with open("jokes.txt", 'r') as file:
            for i in range(random.randrange(0, 76)):
                read_data = file.readline()
    except:
        print('Could not open file.')
        exit(1)
    file.close()

    # Facebook
    service = Service('/python-selenium/chromedriver')  # path to the chromedriver
    service.start()
    driver = webdriver.Remote(service.service_url)
    driver.get('https://www.facebook.com')
    username = driver.find_element_by_id("email")
    password = driver.find_element_by_id("pass")
    submit = driver.find_element_by_id("loginbutton")

    # facebook login
    username.send_keys("YOUR-EMAIL")
    password.send_keys(pw)
    time.sleep(3.2)  # wait for a bit
    submit.click()
    time.sleep(4.8)  # wait for a bit

    # set up the post to share a joke
    status = driver.find_element_by_xpath("//textarea[@name='xhpc_message']")
    status.send_keys(str(read_data))
    postbutton = driver.find_element_by_xpath("//button[contains(.,'Post')]")

    # post!
    postbutton.click()
    time.sleep(7.8)  # wait for a bit
    driver.quit()
class Headless2:

    def __init__(self, loop=None, pool_size=10):
        self.loop = loop if loop else asyncio.get_event_loop()
        self.driver_options = webdriver.ChromeOptions()
        self.driver_options.add_argument('headless')
        self.sem = asyncio.Semaphore(pool_size)
        self.driver_pool = asyncio.Queue()  # idle drivers, filled by init_pool()
        self.service = Service('chromedriver')
        self.service.start()

    async def init_pool(self, pool_size):
        for i in range(pool_size):
            driver = webdriver.Remote(
                self.service.service_url,
                desired_capabilities=self.driver_options.to_capabilities())
            await self.driver_pool.put(driver)

    async def get(self, url, locator=None, timeout=10):
        def _get(_driver, _url):
            wait = WebDriverWait(_driver, timeout, poll_frequency=1)
            try:
                _driver.delete_all_cookies()
                _driver.get(_url)
                if locator:
                    wait.until(EC.presence_of_all_elements_located(locator))
                else:
                    wait.until(lambda x: x.execute_script(
                        'return document.readyState;') == 'complete')
                return _driver.page_source
            except Exception as e:
                print(url, e)
                return None

        async with self.sem:
            print(url)
            driver = webdriver.Remote(
                self.service.service_url,
                desired_capabilities=self.driver_options.to_capabilities())
            # Run the blocking Selenium call in the loop's executor so it can be awaited.
            html = await self.loop.run_in_executor(None, _get, driver, url)
            driver.quit()
            print(url)
            return html
def GetEpubFromHaoDoo(pLink, code, book):
    service = Service('/usr/bin/chromedriver')
    service.start()
    driver = webdriver.Remote(service.service_url)
    driver.implicitly_wait(100)  # seconds
    driver.get(pLink)
    submitElement = driver.find_element_by_xpath('//input[@value="下載 epub 檔"]')
    submitElement.click()
    btn_in_modal_locator = (By.ID, 'okButton')
    wait = WebDriverWait(driver, 100)
    btn_in_modal = wait.until(EC.element_to_be_clickable(btn_in_modal_locator))
    btn_in_modal.click()
    file_path = '/home/raylex/Downloads/' + code + '.epub'
    while not os.path.exists(file_path):
        time.sleep(1)
    os.chdir('/home/raylex/Downloads')
    os.rename(code + '.epub', book + '.epub')
    driver.quit()
    return
class StartScrape:

    def __enter__(self):
        self.chrome_path = '/usr/bin/chromium-browser'
        self.chromedriver_path = '/usr/lib/chromium/chromedriver'
        self.o = Options()
        self.o.binary_location = '/usr/bin/chromium-browser'
        self.o.add_argument('--headless')
        self.o.add_argument('--disable-gpu')
        self.o.add_argument('--no-sandbox')
        self.o.add_argument('--window-size=1200,600')
        self.s = Service(executable_path=self.chromedriver_path)
        self.s.start()
        self.driver = webdriver.Remote(
            self.s.service_url,
            desired_capabilities=self.o.to_capabilities()
        )
        return self.driver

    def __exit__(self, exception_type, exception_value, traceback):
        self.driver.quit()
def download_by_webdriver(url, charset='utf-8', proxy=None, user_agent=None):
    # Take a URL, fetch it with the browser, and return the page.
    print("[download_by_webdriver]: begin download the link %s" % url)
    try:
        # Browser options
        options = webdriver.ChromeOptions()
        # Chrome headless mode
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        # options.add_argument('window-size=1200x600')
        # Set the language to Chinese
        options.add_argument('lang=zh_CN.UTF-8')
        # Set the proxy
        if proxy:
            print("[download_by_webdriver]: use proxy %s" % proxy)
            options.add_argument('proxy-server=' + proxy)
        # Add the user-agent header
        if user_agent:
            options.add_argument('user-agent=' + user_agent)
        else:
            options.add_argument(
                'user-agent=' +
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/71.0.3578.98 Safari/537.36')
        # Set up the driver service
        c_service = Service('/usr/local/bin/chromedriver')
        c_service.command_line_args()
        c_service.start()
        driver = webdriver.Chrome(chrome_options=options)
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(15)
        p_content = driver.page_source.encode(charset, "ignore").decode(charset, 'ignore')
        current_url = driver.current_url
        driver.quit()
        c_service.stop()
    except Exception as e:
        print("[download_by_webdriver]:", e)
        p_content, current_url = None, None
    return p_content, current_url
def download_by_webdriver(url, charset='utf-8'):
    # Take a URL, fetch it with the browser, and return the page.
    print("[download_by_webdriver]: begin download the link %s" % url)
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        c_service = Service('/usr/local/bin/chromedriver')
        c_service.command_line_args()
        c_service.start()
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        driver.implicitly_wait(10)
        content = driver.page_source.encode(charset, "ignore").decode(
            charset, 'ignore')
        current_url = driver.current_url
        driver.quit()
        c_service.stop()
    except Exception as e:
        print("[download_by_webdriver]:", e)
        content, current_url = None, None
    return content, current_url
def get_product_links():
    service = Service(DRIVER_PATH)
    service.start()
    driver = webdriver.Remote(service.service_url)
    ssl._create_default_https_context = ssl._create_unverified_context
    logging.info(f"Starting scraping of links for: {base_url}")

    # get all product links from all pages
    i = 1
    products_per_page = 24
    links = []
    while True:
        logging.info(f"Getting links on page # {i}")
        driver.get(f'{base_url}/product-category/all-livestock/page/{i}/')
        # __scroll_down_page(driver)

        # get the image source
        current_page_links = [
            element.get_attribute("href")
            for element in driver.find_elements_by_tag_name('a')
        ]
        current_page_links = [
            link for link in current_page_links
            if link.startswith(f"{base_url}/product/")
        ]
        logging.info(f"Found {len(current_page_links)} products on page # {i}")
        i += 1
        links.extend(current_page_links)
        if len(current_page_links) < products_per_page:
            logging.info("No more links available")
            break
    driver.quit()
    return set(links)