from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # import the Options class from the chrome.options module
import time
from bs4 import BeautifulSoup

chrome_options = Options()                    # instantiate an Options object
chrome_options.add_argument('--headless')     # run Chrome in headless (silent) mode
driver = webdriver.Chrome(options=chrome_options)  # use Chrome as the engine, running quietly in the background
# driver = webdriver.Chrome()                 # use Chrome as the engine, opening a real browser window

driver.get("https://localprod.pandateacher.com/python-manuscript/hello-spiderman/")
time.sleep(2)

pageSource = driver.page_source               # fully rendered page source, returned as a 'str'
print(type(pageSource))

# label = driver.find_elements_by_tag_name('label')  # parse the page and extract every 'label' tag, returned as a 'list'
a = driver.find_element_by_css_selector("[class='xl-chrome-ext-bar']")  # use a CSS selector when the class attribute contains a space
labels = driver.find_element_by_tag_name('label')
print(type(labels))                           # a single 'label' tag, returned as a 'WebElement'
print(labels.text)                            # extract the element's text
print(labels.get_attribute('type'))           # pass an attribute name to read its value

# print(type(label))
# for i in label:
#     print(i.text)
#     print(label.get_attribute('type'))      # read the attribute's value
# find_element_by_link_text          locates a hyperlink by its full link text
# find_element_by_partial_link_text  locates a hyperlink by part of its link text
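# The find_element_by_* helpers used above were removed in Selenium 4.  A minimal sketch
# of the equivalent calls with the By locator strategy (assuming Selenium 4 and the same
# `driver` object; the link texts are placeholder assumptions):
from selenium.webdriver.common.by import By

bar = driver.find_element(By.CSS_SELECTOR, "[class='xl-chrome-ext-bar']")
label = driver.find_element(By.TAG_NAME, 'label')
labels = driver.find_elements(By.TAG_NAME, 'label')              # list of WebElements
link = driver.find_element(By.LINK_TEXT, 'some link text')       # hypothetical link text
partial = driver.find_element(By.PARTIAL_LINK_TEXT, 'some link') # hypothetical partial text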
if i != page_all:
    WebDriverWait(driver, 10).until(
        EC.text_to_be_present_in_element(
            (By.XPATH, '//*[@id="datagrid-row-r1-1-9"]/td[1]/div'), str(i * 10)))
else:
    time.sleep(1)
try:
    get_content(driver)
except:
    print('An exception occurred - please debug the code')


if __name__ == '__main__':
    db = To_db()
    db.create_db(DB_NAME)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # driver = webdriver.Chrome()
    driver.implicitly_wait(10)
    url = 'http://www.whzbtb.cn/V2PRTS/OpeningRoomInfoListInit.do'
    driver.get(url)
    change_page(driver)
    db.close_db()
    driver.close()
    driver.quit()
    # finish_all
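# A minimal sketch of the pagination wait used above, pulled out into a helper so the
# "wait until the row counter shows the next page" idea is easier to reuse.  The XPath,
# the 10-second timeout, and the `page * 10` convention come from the snippet; the helper
# name itself is hypothetical.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_page(driver, page, timeout=10):
    """Block until the first cell of the data grid shows the row index for `page`."""
    WebDriverWait(driver, timeout).until(
        EC.text_to_be_present_in_element(
            (By.XPATH, '//*[@id="datagrid-row-r1-1-9"]/td[1]/div'), str(page * 10)))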
def __init__(self):
    self._options = Options()
    self._options.add_argument('--headless')
    # Optional[WebDriver] is the intended annotation (assumes `from typing import Optional`);
    # the original `WebDriver or None` simply evaluates to WebDriver.
    self._web_driver: Optional[WebDriver] = None
    self._headed_web_driver: Optional[WebDriver] = None
    atexit.register(self.cleanup)
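# A hedged sketch of how the lazily-created drivers and the registered cleanup hook might
# fit together on the same class.  The method bodies below are assumptions, not taken
# from the original code:
def headless_driver(self) -> WebDriver:
    # create the headless Chrome on first use and cache it
    if self._web_driver is None:
        self._web_driver = webdriver.Chrome(options=self._options)
    return self._web_driver


def cleanup(self) -> None:
    # quit whichever drivers were actually started; runs automatically via atexit
    for driver in (self._web_driver, self._headed_web_driver):
        if driver is not None:
            driver.quit()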
def scrape_recipe_sources(recipe_sources, batch_id): """Scrape the given list of recipe sources. The scraping is processed with a process assigned with the given batch ID. Args: recipe_sources: List of recipe sources. A source contains a 'url', the recipe URL, and 'categories', the categories associated with the recipe page. batch_id: Batch ID assigned to the process where the scaping is conducted. Returns: True if the scraping has been completed for all recipe sources. """ options = Options() options.headless = True driver = webdriver.Chrome('./chromedriver', options=options) cache_path = recipe_cache_path(batch_id) scraped_ids = set() if path.exists(cache_path): scraped_ids = set([ row[0] for row in pandas.read_csv(RESULT_CSV_PATH, usecols=['id']).values ]) else: with open(cache_path, 'w') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=RECIPE_CSV_FIELDNAMES) writer.writeheader() try: with open(cache_path, 'a') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=RECIPE_CSV_FIELDNAMES) for i, recipe_source in enumerate(recipe_sources, start=1): print('Batch {} processing recipe #{}'.format(batch_id, i)) recipe_id = recipe_id_from_recipe_url(recipe_source['url']) if recipe_id in scraped_ids: continue recipe_content = None # Sometimes the driver experiences a connection failure. Keep trying to scrape one page until it succeeded. try: recipe_content = scrape_single_recipe_url( recipe_source['url'], recipe_source['categories'], driver) except: # Instantiate a new driver. try: driver.close() driver.quit() finally: time.sleep(1) driver = webdriver.Chrome('./chromedriver', options=options) recipe_content = None writer.writerow(recipe_content) finally: driver.close() driver.quit() return True
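# In the except-branch above the failed page is written out as None and never retried,
# despite the comment about "keep trying".  A hedged sketch of a bounded retry loop around
# the same scrape_single_recipe_url call (the helper name and max_attempts are assumptions,
# not part of the original function):
def scrape_with_retry(recipe_source, driver, options, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            content = scrape_single_recipe_url(
                recipe_source['url'], recipe_source['categories'], driver)
            return content, driver
        except Exception:
            # the driver sometimes loses its connection; recreate it and try again
            try:
                driver.quit()
            finally:
                time.sleep(1)
                driver = webdriver.Chrome('./chromedriver', options=options)
    return None, driver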
def dr():
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    # use a raw string so the backslashes in the Windows path are not treated as escapes
    chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
    return driver
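# dr() attaches to a Chrome instance that is already listening on the DevTools port, so
# the browser has to be started with remote debugging enabled first, e.g. (the profile
# directory is a placeholder assumption):
#
#   chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrome-debug-profile"
#
# After that, dr() can drive the already-open window:
driver = dr()
driver.get("https://example.com")
print(driver.title)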
async def carbon_api(e):
    if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"):
        """ A Wrapper for carbon.now.sh """
        await e.edit("`Processing..`")
        CARBON = 'https://carbon.now.sh/?l={lang}&code={code}'
        global CARBONLANG
        textx = await e.get_reply_message()
        pcode = e.text
        if pcode[8:]:
            pcode = str(pcode[8:])
        elif textx:
            pcode = str(textx.message)  # Importing message to module
        code = quote_plus(pcode)  # Converting to urlencoded
        await e.edit("`Making Carbon...\n25%`")
        url = CARBON.format(code=code, lang=CARBONLANG)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.binary_location = GOOGLE_CHROME_BIN
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        prefs = {'download.default_directory': './'}
        chrome_options.add_experimental_option('prefs', prefs)
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER, options=chrome_options)
        driver.get(url)
        await e.edit("`Be Patient...\n50%`")
        download_path = './'
        driver.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow', 'downloadPath': download_path}}
        command_result = driver.execute("send_command", params)
        driver.find_element_by_xpath("//button[contains(text(),'Export')]").click()
        driver.find_element_by_xpath("//button[contains(text(),'4x')]").click()
        driver.find_element_by_xpath("//button[contains(text(),'PNG')]").click()
        await e.edit("`Processing..\n75%`")
        # Waiting for downloading
        sleep(2.5)
        await e.edit("`Done Dana Done...\n100%`")
        file = './carbon.png'
        await e.edit("`Uploading..`")
        await e.client.send_file(
            e.chat_id,
            file,
            caption="<< Here's your carbon, gey boi! \n Carbonised by @inferno_scorpion >> ",
            force_document=True,
            reply_to=e.message.reply_to_msg_id,
        )
        # Removing carbon.png after uploading (the original removed './Anubis.png',
        # which is never created by this handler)
        os.remove('./carbon.png')
        driver.quit()
        await e.delete()  # Deleting msg
def launch_browser(): if env.RUNNING_BROWSER.upper() == "FIREFOX": #os.popen("TASKKILL /F /IM firefox.exe") fp = FirefoxProfile() fp.native_events_enabled = False binary_path = common.get_value_from_conf("FIREFOX_BINARY_PATH") if binary_path == "": env.BROWSER = webdriver.Firefox(firefox_profile=fp) else: fb = FirefoxBinary(firefox_path=binary_path) env.BROWSER = webdriver.Firefox(firefox_profile=fp, firefox_binary=fb) elif env.RUNNING_BROWSER.upper() == "CHROME": #os.popen("TASKKILL /F /IM chrome.exe") os.popen("TASKKILL /F /IM chromedriver.exe") binary_path = common.get_value_from_conf("CHROME_BINARY_PATH") chromedriver = common.get_value_from_conf("DRIVER_CHROME") if binary_path == "": os.environ["webdriver.chrome.driver"] = chromedriver env.BROWSER = webdriver.Chrome(executable_path=chromedriver) else: opts = Options() opts.binary_location = binary_path os.environ["webdriver.chrome.driver"] = chromedriver env.BROWSER = webdriver.Chrome(executable_path=chromedriver, chrome_options=opts) elif env.RUNNING_BROWSER.upper() == "IE": #os.popen("TASKKILL /F /IM iexplore.exe") os.popen("TASKKILL /F /IM IEDriverServer.exe") dc = DesiredCapabilities.INTERNETEXPLORER.copy() dc['acceptSslCerts'] = True dc['nativeEvents'] = True iedriver = common.get_value_from_conf("DRIVER_IE") os.environ["webdriver.ie.driver"] = iedriver env.BROWSER = webdriver.Ie(executable_path=iedriver, capabilities=dc) else: return False env.TEST_URL = common.get_value_from_conf("TESTING_URL") env.BROWSER.get(env.TEST_URL) env.BROWSER.maximize_window() time.sleep(3) return True
def __init__(self):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    self.browser = webdriver.Chrome(chrome_options=chrome_options)
    super(JdgoodsSpider, self).__init__()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
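# The dispatcher.connect call above expects a spider_closed handler on the spider.  Its
# body is not shown in this snippet, so the following is an assumed minimal sketch that
# simply shuts the headless browser down when the Scrapy spider finishes:
def spider_closed(self, spider):
    self.browser.quit()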
async def carbon_api(e): """ carbon.now.sh için bir çeşit wrapper """ await e.edit("`İşleniyor...`") CARBON = 'https://carbon.now.sh/?l={lang}&code={code}' global CARBONLANG textx = await e.get_reply_message() pcode = e.text if pcode[8:]: pcode = str(pcode[8:]) elif textx: pcode = str(textx.message) # Girilen metin, modüle aktarılıyor. code = quote_plus(pcode) # Çözülmüş url'ye dönüştürülüyor. await e.edit("`İşleniyor...\nTamamlanma Oranı: 25%`") if os.path.isfile("./carbon.png"): os.remove("./carbon.png") url = CARBON.format(code=code, lang=CARBONLANG) chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.binary_location = GOOGLE_CHROME_BIN chrome_options.add_argument("--window-size=1920x1080") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-gpu") prefs = {'download.default_directory': './'} chrome_options.add_experimental_option('prefs', prefs) driver = webdriver.Chrome(executable_path=CHROME_DRIVER, options=chrome_options) driver.get(url) await e.edit("`İşleniyor...\nTamamlanma Oranı: 50%`") download_path = './' driver.command_executor._commands["send_command"] = ( "POST", '/session/$sessionId/chromium/send_command') params = { 'cmd': 'Page.setDownloadBehavior', 'params': { 'behavior': 'allow', 'downloadPath': download_path } } command_result = driver.execute("send_command", params) driver.find_element_by_xpath("//button[contains(text(),'Export')]").click() # driver.find_element_by_xpath("//button[contains(text(),'4x')]").click() # driver.find_element_by_xpath("//button[contains(text(),'PNG')]").click() await e.edit("`İşleniyor...\nTamamlanma Oranı: 75%`") # İndirme için bekleniyor while not os.path.isfile("./carbon.png"): await sleep(0.5) await e.edit("`İşleniyor...\nTamamlanma Oranı: 100%`") file = './carbon.png' await e.edit("`Resim karşıya yükleniyor...`") await e.client.send_file( e.chat_id, file, caption="Bu resim [Carbon](https://carbon.now.sh/about/) kullanılarak yapıldı,\ \nbir [Dawn Labs](https://dawnlabs.io/) projesi.", force_document=True, reply_to=e.message.reply_to_msg_id, ) os.remove('./carbon.png') driver.quit() # Karşıya yüklemenin ardından carbon.png kaldırılıyor await e.delete() # Mesaj siliniyor
async def carbon_api(e): if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"): """ A Wrapper for carbon.now.sh """ await e.edit("⬜⬜⬜⬜⬜") CARBON = 'https://carbon.now.sh/?l={lang}&code={code}' CARBONLANG = "en" textx = await e.get_reply_message() pcode = e.text if pcode[8:]: pcode = str(pcode[8:]) elif textx: pcode = str(textx.message) # Importing message to module code = quote_plus(pcode) # Converting to urlencoded url = CARBON.format(code=code, lang=CARBONLANG) chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.binary_location = Config.GOOGLE_CHROME_BIN chrome_options.add_argument("--window-size=1920x1080") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument('--disable-gpu') prefs = {'download.default_directory': './'} chrome_options.add_experimental_option('prefs', prefs) await e.edit("⬛⬛⬜⬜⬜") driver = webdriver.Chrome(executable_path=Config.CHROME_DRIVER, options=chrome_options) driver.get(url) download_path = './' driver.command_executor._commands["send_command"] = ( "POST", '/session/$sessionId/chromium/send_command') params = { 'cmd': 'Page.setDownloadBehavior', 'params': { 'behavior': 'allow', 'downloadPath': download_path } } command_result = driver.execute("send_command", params) driver.find_element_by_xpath( "//button[contains(text(),'Export')]").click() sleep(5) # this might take a bit. driver.find_element_by_xpath("//button[contains(text(),'4x')]").click() sleep(5) await e.edit("⬛⬛⬛⬜⬜") driver.find_element_by_xpath( "//button[contains(text(),'PNG')]").click() sleep(5) #Waiting for downloading await e.edit("⬛⬛⬛⬛⬛") file = './carbon.png' await e.edit("✅Carbon Completed, Uploading Carbon✅") await e.client.send_file( e.chat_id, file, caption= "Carbon by [@r4v4n4](https://www.github.com/ravana69/pornhub)", force_document=False, reply_to=e.message.reply_to_msg_id, ) os.remove('./carbon.png') # Removing carbon.png after uploading await e.delete() # Deleting msg
def __init__(self,keyword): # 关键词 self.keyword = keyword self.cookies = {} # cookie池 self.cookie = [ 'thw=cn; t=30912a0211d2f7c4b616585bc4825060; hng=CN%7Czh-CN%7CCNY%7C156; enc=pzdR76EQ9XgSRGR82Xq45tmJruRFWu0FouJ8kQAkE3nawWt6z1uotCujQi0PcMIZI%2FB7iYyg4rl8rsxLX1xJSA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=bd1525710e4afb9c577d8a990f3353b7_1564926869087; _m_h5_tk_enc=5a6d2edb8c06b13c3c5e8c0a0e6dd566; cookie2=166f3000b9672c536c59566b63e90b79; _tb_token_=eeb3a3bb33a33; _uab_collina=156492382396750161615565; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; swfstore=200907; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; v=0; x5sec=7b227365617263686170703b32223a223464386535366432633737383636646133303535303466613238643738363734434f572b6d2b6f46454b2b367a35364f334b625954526f504d6a49774d7a41334d6a597a4e6a4d7a4d447378227d; JSESSIONID=8405DFCA73B5B3B1C30D44ED9D1A4A79; isg=BEVFs3FGtIJs1ZBax07XR-RvVIG_qvjxG33nnEeq5nyL3mdQDlZSZIH86EKNnhFM; l=cBSHVLunqYl65142BOfZCuI8LPbt5IRbzsPzw4OG4ICPOb5e5cvcWZFPC28wCnGVK6uJJ3oWYJ1uB0L5yyCqJxpsw3k_J_f..', # 'thw=cn; t=4a67cf0f54b38a06b12baa6d7011ac01; enc=z0TfWvQ9HWXGg%2FRoa2MY2HYj2UfgrfgniIYK%2FEv2r%2FGt32csHyi8iBOmGabkyql62Uuf9%2BYrgcukKLieAnE%2FjA%3D%3D; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; hng=CN%7Czh-CN%7CCNY%7C156; v=0; cookie2=1188c00dd90500ec2caf188256b95566; _tb_token_=555f6ebe1f33b; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; JSESSIONID=8D4A142105251AA3BD42FD981CC4D589; uc1=cookie14=UoTaHP3Aq5rlzQ%3D%3D; isg=BC0t8WQ9DM36--hecDw_dwcBPMlnImA0s3X_RG8wZkQU5kmYNNvhLE_g0fql0nkU; l=cBTFlrMmqbmpLQB3BOfgCuI8Ls7OmQAfCfVzw4OGjICP9mCwkrwcWZFXDALeCnhVp6UM83oWYJ1uBeYBqtftHxoD2j-la', # 'thw=cn; t=4a67cf0f54b38a06b12baa6d7011ac01; enc=z0TfWvQ9HWXGg%2FRoa2MY2HYj2UfgrfgniIYK%2FEv2r%2FGt32csHyi8iBOmGabkyql62Uuf9%2BYrgcukKLieAnE%2FjA%3D%3D; mt=ci=0_0; cna=353FFR0e5FsCAXa3WpLq0giz; hng=CN%7Czh-CN%7CCNY%7C156; v=0; cookie2=1188c00dd90500ec2caf188256b95566; _tb_token_=555f6ebe1f33b; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; uc1=cookie14=UoTaHP3Aq5rlzQ%3D%3D; JSESSIONID=D1451FA603364663FBC5F784EEC1FCC0; isg=BLW1cbogBDUCUGD2CFQHX08pxDGvmmgsay137DfaIix7DtQA24PHFIzMWZKdeoH8; l=cBTFlrMmqbmpL_gtBOCwSuI8Ls79YIR2muPRwC0Xi_5Q49L6OfbOkStYshp6DjWd9SJ640tUd_29-etliOHx3mx-g3fP.', ] # user-agent池 self.user_agent = [ "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", ] self.headers = { 'cookie': random.choice(self.cookie), 'referer': 'https://s.taobao.com/search', 'user-agent': random.choice(self.user_agent), } # IP代理 self.ip = [ # '117.191.11.111:8080', '117.191.11.113:80', '117.191.11.109:8080', '117.191.11.80:80', '117.191.11.76:8080', '117.191.11.80:80', '117.191.11.108:80', '117.191.11.111:80', '117.191.11.109:8080', '39.135.24.11:80', '117.191.11.109:80', '117.191.11.108:8080', '117.191.11.110:8080', '35.183.111.234:80', '144.217.229.157:1080', '39.137.69.7:80', '39.137.69.7:8080', '39.137.69.10:8080' ] self.proxies = { 'http': random.choice(self.ip), } # 获取当前的年月日 self.date = time.strftime('%Y%m%d',time.localtime(time.time())) # 初始页码 self.page = 0 # 淘宝搜索url self.page_url = 'https://s.taobao.com/search?q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{date}&ie=utf8&s={page}' # 开启一个session会话 self.session = 
    requests.session()  # completes `self.session = ...` started on the previous line: open a requests session
    # assign the CookieJar to the session
    self.session.cookies = self.read_cookies()
    # MySQL database
    self.connect = pymysql.connect(host='localhost', port=3306, user='******', passwd='0000', db='scrapytest')
    self.cursor = self.connect.cursor()
    chrome_options = Options()
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    chrome_drive = r'D:\soft\Chrome\chromedriver.exe'
    self.driver = webdriver.Chrome(executable_path=chrome_drive, options=chrome_options)
    self.count = 1  # attempt captcha recognition at most 6 times
def dgmk(keyword, uname):
    chrome_path = 'C:/Users/LGPC/Desktop/sparta/Jungo-project/driver/chromedriver_v81/chromedriver_win32/chromedriver'
    # chrome_path = '/usr/bin/chromedriver'

    # options so the Chrome browser runs headless (no GUI)
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=options, executable_path=chrome_path)
    driver.implicitly_wait(3)

    url = 'https://www.daangn.com/search/' + keyword
    driver.get(url)
    driver.implicitly_wait(1)

    # click the [more] button in the Chrome window -> repeat 12 times (about 150 items)
    for i in range(12):
        try:
            driver.find_element_by_xpath('//*[@id="result"]/div[1]/div[2]').click()
            driver.implicitly_wait(1)
            driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(0.5)
        # if the [more] button is gone, crawl the current page
        except:
            print('Dangn-market more-button end')
            time.sleep(1)
            break

    # crawl the HTML of the current page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # store the crawled data in the DB
    items_uname = 'items_' + uname
    items = soup.select('div > article.flea-market-article')
    for item in items:
        item_img = item.select_one('div.card-photo > img')['src']
        item_title = item.select_one('span.article-title').text
        item_position = item.select_one('p.article-region-name').text
        item_price = item.select_one('p.article-price').text
        item_link = item.select_one('article.flea-market-article > a')['href']
        doc = {
            'keyword': keyword,
            'img': item_img,
            'title': item_title,
            'position': item_position,
            'price': item_price,
            'link': 'daangn.com' + item_link
        }
        db[items_uname].insert_one(doc)

    # close the Chrome browser
    driver.close()

    result = db[items_uname].find({'keyword': keyword})
    if result is not None:
        print('Dangn-market crawling success!! (save to DB)')
    else:
        print('ERROR!! Dangn-market crawling Fail...')
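# A small usage sketch for the crawler above: run it for one keyword/user pair and read
# the documents it wrote back out of the per-user collection.  The keyword and user name
# below are placeholder assumptions:
dgmk('bicycle', 'demo_user')
for item in db['items_demo_user'].find({'keyword': 'bicycle'}):
    print(item['title'], item['price'])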
#!/usr/bin/env python3
import os
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

opts = Options()
# pass the (otherwise unused) Options object to the driver
browser = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
browser.get("https://www.nhm.ac.uk/wpy/gallery?tags=")
elements = browser.find_elements_by_class_name("ImageGrid__container___YSm77")
hrefs = [element.get_attribute('href') for element in elements]

# write a file of urls
with open("urls.txt", "w") as url_file:
    for href in hrefs:
        url_file.write(href + "\n")
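# If the gallery scrape should run without opening a browser window, the Options object
# can carry the headless flag -- a hedged sketch, assuming the same imports as above:
opts = Options()
opts.add_argument("--headless")
opts.add_argument("--window-size=1920,1080")
browser = webdriver.Chrome(ChromeDriverManager().install(), options=opts)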
async def carbon_api(e): if not e.text[0].isalpha() and e.text[0] not in ("/", "#", "@", "!"): """ A Wrapper for carbon.now.sh """ await e.edit("🌚🌚🌚🌚🌚") CARBON = 'https://carbon.now.sh/?bg=rgba(29%2C40%2C104%2C1)&t=one-light&wt=none&l=application%2Ftypescript&ds=true&dsyoff=20px&dsblur=68px&wc=true&wa=true&pv=56px&ph=56px&ln=false&fl=1&fm=Hack&fs=14px&lh=143%25&si=false&es=2x&wm=false&code={code}' CARBONLANG = "en" textx = await e.get_reply_message() pcode = e.text if pcode[8:]: pcode = str(pcode[8:]) elif textx: pcode = str(textx.message) # Importing message to module code = quote_plus(pcode) # Converting to urlencoded url = CARBON.format(code=code, lang=CARBONLANG) chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.binary_location = Config.GOOGLE_CHROME_BIN chrome_options.add_argument("--window-size=1920x1080") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument('--disable-gpu') prefs = {'download.default_directory': './'} chrome_options.add_experimental_option('prefs', prefs) await e.edit("🌝🌝🌚🌚🌚") driver = webdriver.Chrome(executable_path=Config.CHROME_DRIVER, options=chrome_options) driver.get(url) download_path = './' driver.command_executor._commands["send_command"] = ( "POST", '/session/$sessionId/chromium/send_command') params = { 'cmd': 'Page.setDownloadBehavior', 'params': { 'behavior': 'allow', 'downloadPath': download_path } } command_result = driver.execute("send_command", params) driver.find_element_by_xpath( "//button[contains(text(),'Export')]").click() sleep(5) # this might take a bit. driver.find_element_by_xpath("//button[contains(text(),'4x')]").click() sleep(5) await e.edit("🌝🌝🌝🌚🌚") driver.find_element_by_xpath( "//button[contains(text(),'PNG')]").click() sleep(5) #Waiting for downloading await e.edit("🌝🌝🌝🌝🌝") file = './carbon.png' await e.edit("✅Karbon4 Completed, Uploading Karbon✅") await e.client.send_file( e.chat_id, file, caption="Karbon4 by [@NoOne](https://t.me/kirito6969)", force_document=True, reply_to=e.message.reply_to_msg_id, ) os.remove('./carbon.png') # Removing carbon.png after uploading await e.delete() # Deleting msg
def main():
    start_time = datetime.now()
    n = 0
    while True:
        sleep(5)
        print("Restarting the browser")
        chromeOptions = Options()
        chromeOptions.add_argument('--headless')
        chromeOptions.add_argument('--no-sandbox')
        prefs = {"profile.managed_default_content_settings.images": 2}
        chromeOptions.add_experimental_option('prefs', prefs)
        driver = webdriver.Chrome(chrome_options=chromeOptions)
        # driver = webdriver.Chrome(options=chromeOptions)
        driver.get('https://www.w3.org/People/mimasa/test/')
        while True:
            try:
                product_ = MySQL().get_product_reviews()
                if product_:
                    sku = product_['sku']
                    id_product = product_['id']
                    id_category = product_['id_category']

                    # build the reviews URL
                    link_ = product_['link_product']
                    if link_.endswith('/'):
                        link = link_ + 'otzyvy/'
                    else:
                        link = link_ + '/otzyvy/'

                    # fetch the page and wait until it has finished loading
                    driver.get(link)
                    tmp = ''
                    while True:
                        page_source = driver.page_source
                        if len(page_source) > len(tmp):
                            tmp = page_source
                        else:
                            break

                    reviews = get_reviews(sku=sku, page_source=page_source)
                    # print(reviews)
                    if reviews:
                        MySQL().write_reviews(reviews=reviews)
                        MySQL().set_product_reviews_ready(id_product=id_product)
                    elif reviews is False:
                        MySQL().set_product_reviews_bad(id_product=id_product)
                    else:
                        MySQL().set_product_reviews_ready(id_product=id_product)
                    n += 1
                    print('\rCount', n, 'Time', datetime.now() - start_time, end='')
            except:
                break
        driver.close()
        driver.quit()
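# The length-comparison loop above is a heuristic for "the page has stopped growing".
# A hedged alternative sketch that instead waits until the document reports it is fully
# loaded (assumes the same `driver` object and
# `from selenium.webdriver.support.ui import WebDriverWait`):
WebDriverWait(driver, 30).until(
    lambda d: d.execute_script("return document.readyState") == "complete")
page_source = driver.page_source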
def getData(request): url = "https://918hj.zjlbw.top/" chrome_options = Options() chrome_options.add_argument('--headless') driver = webdriver.Chrome(chrome_options=chrome_options) driver.get(url) iframe = driver.find_elements_by_tag_name('iframe')[0] # 查找第一个[0]iframe driver.switch_to.frame(iframe) # 查找进入刚刚查找的iframe里面 iframe = driver.find_elements_by_tag_name('iframe')[0] # 重复 driver.switch_to.frame(iframe) # 重复 soup = BeautifulSoup(driver.page_source, "html.parser") # 解码 soup_dl = soup.find_all("dl") legend_list = [] howMany = 0 for index, dl in enumerate(soup_dl): temp_list = [] howMany += 1 for a in dl.find_all("a"): temp_list.append(a.string) for b in dl.find_all("span"): if b.string == None or '': for bb in b.find_all("font"): if bb.string == None or '': continue else: temp_list.append(bb.string) else: temp_list.append(b.string) try: spanIsNone = dl.span.string except: for index, c in enumerate(dl.find_all("dd")): if index >= 2 and index <= 5: if c.string == None or '': for index, cc in enumerate(c.strings): if index == 0: temp_list.append(cc) else: continue # c5 = c.parent.find(class_="c5") # # print("c.父节点", c.parent.find(class_="c5")) else: temp_list.append(c.string.replace("\xa0", "")) temp_list.append(dl.a["href"]) del temp_list[2] # 把多余的删除 legend_list.append(temp_list) for legend in legend_list: print("dl:", legend) # spli = legend[2].split("/") # month = spli[0][0] # day = spli[1][0:2] # hour = spli[2][0] # minute = spli[2][2:4] # year = time.strftime('%Y', time.localtime(time.time())) # dd = "%s-%s-%s %s:%s:0" % (year, month, day, hour, minute) #如果时间是空的,就给个假时间给它 if len(legend) < 4 or len(legend) >= 8: print("警告,列表元素小于3或大于7") continue if legend[2] == None or '': temp_month = time.strftime('%m', time.localtime(time.time())) temp_day = time.strftime('%d', time.localtime(time.time())) temp_legend = "%s月/%s日/★错误时间★" % (temp_month, temp_day) else: temp_legend = legend[2] spli = temp_legend.split("/") if spli[0] == "---精品全天固定---": onPage = "allDay" ttime = time.strftime('%Y-%m-%d 0:0:0', time.localtime(time.time())) dd = ttime legendSite.objects.create(serverName=legend[0], ip=legend[1], time=dd, type=legend[3], introduce=legend[4], QQ=legend[5], href=legend[6], onPage=onPage) elif spli[-1] == "★通宵推荐★": onPage = "allNight" ttime = time.strftime('%Y-%m-%d 0:0:0', time.localtime(time.time())) dd = ttime legendSite.objects.create(serverName=legend[0], ip=legend[1], time=dd, type=legend[3], introduce=legend[4], QQ=legend[5], href=legend[6], onPage=onPage) elif spli[-1] == "★错误时间★": onPage = "error" ttime = time.strftime('%Y-%m-%d 0:0:0', time.localtime(time.time())) dd = ttime legendSite.objects.create(serverName=legend[0], ip=legend[1], time=dd, type=legend[3], introduce=legend[4], QQ=legend[5], href=legend[6], onPage=onPage) else: try: onPage = "normal" year = time.strftime('%Y', time.localtime(time.time())) month = spli[0].replace("月", '') day = spli[1].replace("日", '') hour = spli[2].split("点")[0] minute = spli[2].split("点")[1].split("开放")[0].replace('分', '') minute = minute if minute != '' else 0 dd = "%s-%s-%s %s:%s:0" % (year, month, day, hour, minute) legendSite.objects.create(serverName=legend[0], ip=legend[1], time=dd, type=legend[3], introduce=legend[4], QQ=legend[5], href=legend[6], onPage=onPage) except: onPage = "allNight" ttime = time.strftime('%Y-%m-%d 0:0:0', time.localtime(time.time())) dd = ttime legendSite.objects.create(serverName=legend[0], ip=legend[1], time=dd, type=legend[3], introduce=legend[4], QQ=legend[5], onPage=onPage)
def __init__(self):
    chrome_options = Options()
    chrome_options.add_argument('--lang=pt-BR')
    self.driver = webdriver.Chrome(
        executable_path=r'./chromedriver.exe',
        options=chrome_options)
import time
from selenium.webdriver.chrome.options import Options
import os, sys

# deciding folder name
if len(sys.argv) > 1:
    default_folder_name = sys.argv[1]
else:
    default_folder_name = 'Song'

download_path = "C:/Users/gaurav.khatri/Downloads/"
final_path = os.path.join(download_path, default_folder_name)
if not os.path.exists(final_path):
    os.mkdir(final_path)

# Global options to suppress the "Show notifications" popups
option = Options()
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")
option.add_experimental_option(
    "prefs", {
        "profile.default_content_setting_values.notifications": 2,
        "download": {
            'default_directory': final_path
        }
    })


def check_download_completion(wait=False):
    download_path = "C:/Users/gaurav.khatri/Downloads"
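# The body of check_download_completion is cut off above.  A minimal sketch of what such
# a helper usually does -- poll the download directory until Chrome's temporary
# .crdownload files disappear -- under the assumption that it should return True once all
# downloads have finished (the timeout value is an assumption):
def check_download_completion(wait=False, timeout=120):
    download_path = "C:/Users/gaurav.khatri/Downloads"
    deadline = time.time() + timeout
    while wait and time.time() < deadline:
        pending = [f for f in os.listdir(download_path) if f.endswith('.crdownload')]
        if not pending:
            return True
        time.sleep(1)
    return not any(f.endswith('.crdownload') for f in os.listdir(download_path))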
def __init__(self, path=None, headless=True):
    self.browser_options = Options()
    self.browser_options.add_argument('--disable-extensions')
    if headless:
        self.browser_options.add_argument('--headless')
    self.browser = webdriver.Chrome(path, options=self.browser_options)
def __init__(self):
    options = Options()
    options.set_headless(True)  # deprecated in newer Selenium releases; add_argument('--headless') is the replacement
    self.driver = webdriver.Chrome(chrome_options=options)
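# The same constructor written against current Selenium, where Options.set_headless and
# the chrome_options= keyword no longer exist -- a hedged equivalent sketch:
def __init__(self):
    options = Options()
    options.add_argument('--headless')
    self.driver = webdriver.Chrome(options=options)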
def setup(self):
    options = Options()
    options.debugger_address = "127.0.0.1:9222"
    self.driver = webdriver.Chrome(options=options)
def check_res(): data = openpyxl.load_workbook('./it_data1.xlsx', 'r') sheet = data.active print("\n\tDDU IT Student Result Checker") print("Enter the name of student : ", end=" ") name = input() name = name.upper() name = name.strip() flag = 0 for rowNum in range(2, 130): DataName = sheet.cell(row=rowNum, column=2).value #print(produceName) if (DataName.find(name) != -1): flag = 1 Id = sheet.cell(row=rowNum, column=1).value Id = Id.strip() Dob = sheet.cell(row=rowNum, column=3).value if (isinstance(Dob, datetime.datetime)): Dob = Dob.strftime('%m/%d/%Y') Dob = Dob.strip() d = Dob.split('/') if (len(d[0]) == 1): d[0] = '0' + d[0] if (len(d[1]) == 1): d[1] = '0' + d[1] if (len(d[2]) == 2): d[2] = '19' + d[2] Dob = '/'.join(d) print("Searching Result...") print() print("Name : " + DataName) print("Id : " + Id) print("Dob : " + Dob) break if (flag == 0): print("No such student exists...") if (flag): sleep(2) chrome_options = Options() chrome_options.add_argument("disable-infobars") driver = webdriver.Chrome( executable_path= "C:/Users/Admin/PycharmProjects/Results/chromedriver.exe", chrome_options=chrome_options) driver.get("https://egov.ddit.ac.in/index.php?r=site/login") uid = driver.find_element_by_id("LoginForm_username") pswd = driver.find_element_by_id("LoginForm_password") uid.send_keys(Id) pswd.send_keys(Dob) #print(driver.current_url) while (driver.current_url != "https://egov.ddit.ac.in/index.php?r=studentInformation/studentInfo" ): sleep(2) driver.get( "https://egov.ddit.ac.in/index.php?r=tblstudentmst/academicHistory" ) sleep(1) driver.find_element_by_id('yt10').click() print()
def QRLogin(Dictionary): options = Options() # ヘッドレスブラウザ指定 # options.add_argument('--headless') options.add_argument('--disable-gpu') # ChromeDriverのパスとオプションをつけてwebdriverを作成 # this path is for keigo's windows PC # driver = webdriver.Chrome( # 'C:\\Users\\keigo\\chromedriver', options=options) # this path is passed by installing selenium and chromedriver_binary # detail will be in this URl # https://qiita.com/memakura/items/20a02161fa7e18d8a693 # take care of the version of chromedriver_binary, should be same or near to your chrome driver = webdriver.Chrome(options=options) driver.get( 'https://portal.nap.gsic.titech.ac.jp/GetAccess/Login?Template=userpass_key&AUTHMETHOD=UserPassword' ) # time.sleep(1) # あなたのユーザー名/メールアドレス #username = '******' # あなたのパスワード #password = '******' # あなたのユーザー名/メールアドレス gakuseki = Dictionary['gakuseki'] # あなたのパスワード password = Dictionary['PW'] # ユーザー名の入力ボックスを探す F12を押してhtmlを出力し、xpathをここに入力する username_box = driver.find_element_by_xpath( "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[2]/td/div/div/input" ) # パスワードの入力ボックスを探す password_box = driver.find_element_by_xpath( "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[3]/td/div/div/input" ) # ユーザ名とパスワードをインプットする username_box.send_keys(gakuseki) password_box.send_keys(password) # ログインボタンを探す login_button = driver.find_element_by_xpath( "/html/body/center[3]/form/table/tbody/tr/td/table/tbody/tr[5]/td/input[1]" ) # ログインボタンをクリック login_button.click() # time.sleep(1) m1 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[4]/th[1]") m2 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[5]/th[1]") m3 = youso(driver, "//*[@id=\"authentication\"]/tbody/tr[6]/th[1]") matrix1_box = driver.find_element_by_xpath( "//*[@id=\"authentication\"]/tbody/tr[4]/td/div/div/input") matrix1_box.send_keys(Dictionary[m1]) matrix2_box = driver.find_element_by_xpath( "//*[@id=\"authentication\"]/tbody/tr[5]/td/div/div/input") matrix2_box.send_keys(Dictionary[m2]) matrix3_box = driver.find_element_by_xpath( "//*[@id=\"authentication\"]/tbody/tr[6]/td/div/div/input") matrix3_box.send_keys(Dictionary[m3]) # ログインボタンを探す OK = driver.find_element_by_xpath( "//*[@id=\"authentication\"]/tbody/tr[8]/td/input[1]") # ログインボタンをクリック OK.click() time.sleep(1) # スクショ用 # driver.save_screenshot('screenshot.png') # ブラウザを終了 # driver.quit() return "Worked it"
def spr_scrape_postpaid_tablet_prices(): # go to website chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--window-size=1920x1080") chrome_driver = os.getcwd() + "\\chromedriver.exe" driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver) driver.get( 'https://www.sprint.com/en/shop/tablets.html?INTNAV=TopNav:Shop:Tablets&credit=A2&sort=FEATURED' ) time.sleep(5) # get soup html = driver.page_source soup = BeautifulSoup(html, "html.parser") driver.close() # make scraper object scraped_postpaid_price = ScrapedPostpaidPrice() # set hardcoded variables scraped_postpaid_price.provider = 'sprint' scraped_postpaid_price.date = datetime.date.today() scraped_postpaid_price.time = datetime.datetime.now().time() # iterate through devices on landing page for device_tile in soup.findAll( 'li', class_='col-xs-24 col-sm-12 col-lg-8 text-center device-tile'): # get device name text device_name = device_tile.find("h3", { "class": "font-size-18 line-height-24 font-normal my-0 align-left" }).text.strip().lower() # eliminate out of scope devices if device_name.find("linelink") != -1 or device_name.find("pre-owned") != -1 or device_name.find("flip") != -1 \ or device_name.find("sim") != -1 or device_name.find("duraxtp") != -1 or device_name.find("duratr") != -1 \ or device_name.find("xp strike") != -1 or device_name.find("certified") != -1: continue # device name scraped_postpaid_price.device = device_parser(device_name) # url scraped_postpaid_price.url = "https://www.sprint.com" + device_tile.find( "a")["href"] # promo text for device landing page & add to database try: promo_text = device_tile.find("span", { "class": "color--purple font-size-14" }).text.strip() except AttributeError: promo_text = '' add_scraped_promotions_to_database(scraped_postpaid_price.provider, scraped_postpaid_price.device, '0', 'device landing page', promo_text, scraped_postpaid_price.url, scraped_postpaid_price.date, scraped_postpaid_price.time) # go to url chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--window-size=1920x1080") chrome_driver = os.getcwd() + "\\chromedriver.exe" driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver) driver.implicitly_wait(5) driver.get(scraped_postpaid_price.url) time.sleep(5) html = driver.page_source device_soup = BeautifulSoup(html, "html.parser") # if 404 error, stop program site_title = device_soup.find_all("title") if '404' in str(site_title): print('404 Error: ' + scraped_postpaid_price.device) continue # click on drop down menu and record device sizes size_selector = driver.find_element_by_id('sprint_storage_selector') size_selector.click() time.sleep(2) sizes = size_selector.text.strip().replace(' GB', '') sizes = sizes.split('\n') # iterate through sizes for size in sizes: # click on size and reload page select = Select( driver.find_element_by_id('sprint_storage_selector')) select.select_by_value(size) time.sleep(2) html = driver.page_source device_soup = BeautifulSoup(html, "html.parser") # record device size scraped_postpaid_price.storage = size # initialize price variables scraped_postpaid_price.monthly_price = '0.00' scraped_postpaid_price.retail_price = '0.00' scraped_postpaid_price.onetime_price = '0.00' # get prices for label in device_soup.findAll('label', class_='soar-selection__label'): if label.find('strong' ).text == ' Buy it with 24 monthly installments': monthly = label.findAll('span', class_='display-block') 
scraped_postpaid_price.monthly_price = price_parser( monthly[0].text.strip()) scraped_postpaid_price.onetime_price = price_parser( monthly[1].text.strip()) if label.find('strong').text == ' Full price': retail = label.findAll('span', class_='display-block') scraped_postpaid_price.retail_price = price_parser( retail[1].text.strip()) # add to database remove_postpaid_duplicate(scraped_postpaid_price.provider, scraped_postpaid_price.device, scraped_postpaid_price.storage, scraped_postpaid_price.date) add_postpaid_to_database(scraped_postpaid_price.provider, scraped_postpaid_price.device, scraped_postpaid_price.storage, scraped_postpaid_price.monthly_price, scraped_postpaid_price.onetime_price, scraped_postpaid_price.retail_price, scraped_postpaid_price.contract_ufc, scraped_postpaid_price.url, scraped_postpaid_price.date, scraped_postpaid_price.time) spr_scrape_postpaid_promotions(device_soup, scraped_postpaid_price.url, scraped_postpaid_price.device, scraped_postpaid_price.storage) driver.quit()
def show_reviews(data, index): index -= 1 d = data[index] # print(d) link = d[4] link = "https://www.flipkart.com/" + link # print(link) chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--window-size=1920x1080") driver = webdriver.Chrome(options=chrome_options, executable_path="D:\chromedriver.exe") driver.get(link) review_rating = int(input("Review rating ")) # swINJg _3nrCtb e = [] count = 0 time.sleep(1) while len(e) <= 0 and count <= 20: count += 1 try: e = driver.find_elements_by_xpath("//div[@class='swINJg _3nrCtb']") except: continue if len(e) > 0: e[0].click() time.sleep(1) read_more = [] count = 0 while len(read_more) <= 0 and count <= 20: #and count<=20: count += 1 try: read_more = driver.find_elements_by_xpath( "//span[@class='_1EPkIx']") except: continue if len(read_more) > 0: for r in read_more: r.click() try: read_more[len(read_more) - 1].click() except: pass print(len(read_more)) time.sleep(1) # link=driver.current_url # source=requests.get(link).text # soup=BeautifulSoup(source,"html5lib") # //div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][2]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='hGSR34 E_uFuv'] count = len( driver.find_elements_by_xpath( "//div[@class='col _390CkK _1gY8H-']")) for i in range(2, count + 2): rating = driver.find_element_by_xpath( "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][" + str(i) + "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='hGSR34 E_uFuv']" ).text print(rating) if int(rating) >= review_rating: review = driver.find_element_by_xpath( "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][" + str(i) + "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]" ).text title = driver.find_element_by_xpath( "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][" + str(i) + "]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/p[@class='_2xg6Ul']" ).text print("Rating : " + str(rating)) print(title) print() print(review) print() else: count = 0 while len(e) <= 0 and count <= 20: count += 1 try: e = driver.find_elements_by_xpath( "//div[@class='swINJg _3cycCZ']") except: continue e[0].click() time.sleep(1) read_more = [] count = 0 while len(read_more) <= 0 and count <= 20: #and count<=20: count += 1 try: read_more = driver.find_elements_by_xpath( "//span[@class='_2jRR3v']") except: continue if len(read_more) > 0: for r in read_more: r.click() try: read_more[len(read_more) - 1].click() except: pass print(len(read_more)) time.sleep(1) count = len( driver.find_elements_by_xpath( "//div[@class='col _390CkK _1gY8H- _2675cp']")) for i in range(2, count + 2): rating = driver.find_element_by_xpath( "//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][" + str(i) + "]/div[@class='_1PBCrt _26FBOm']/div[@class='col']/div[@class='col _390CkK _1gY8H- _2675cp']/div[@class='row']/div[@class='qwjRop _2675cp']/div" ) print(rating) # if int(rating)>=review_rating: # review=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["+str(i)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]").text # title=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT 
col-9-12']/div[@class='_3gijNv col-12-12']["+str(i)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/p[@class='_2xg6Ul']").text # print("Rating : "+str(rating)) # print(title) # print() # print(review) # print() # path_rating_review="//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12'][10]/div[@class='_1PBCrt _26FBOm']/div[@class='col']/div[@class='col _390CkK _1gY8H- _2675cp']/div[@class='row']/div[@class='qwjRop _2675cp']/div" # lis=soup.select('div[class="col _390CkK _1gY8H-"]') # lis2=[len(lis)] # rating_data=[] # count=0 # for i in lis: # if '"hGSR34 E_uFuv"' in str(i): # rate=str(i.select('div[class="hGSR34 E_uFuv"]')) # rate=rate[0:rate.index("img")-1] # rate=int(format_data(rate)) # if rate>=review_rating: # print("Rating "+ str(rate)) # l=[] # l.append(rate) # if '"_2xg6Ul"' in str(i): # title=str(i.select('p[class="_2xg6Ul"]')) # title=title[0:len(title)-5] # title=format_data(title) # print(title) # else: # print("No title") # # if '"qwjRop"' in str(i): # review=driver.find_element_by_xpath("//div[@class='ooJZfD _2oZ8XT col-9-12']/div[@class='_3gijNv col-12-12']["+str(count+2)+"]/div[@class='_1PBCrt']/div[@class='col']/div[@class='col _390CkK _1gY8H-']/div[@class='row']/div[@class='qwjRop']//div[@class]").text # # review=str(i.select('div[class=""]')) # print(review) # print() # else: # print("No Review") # count+=1 # lis2.append(i) while True: pass
def autoupdate_chromedriver(): driverName = "/chromedriver.exe" # defining base file directory of chrome drivers driver_loc = "C:/Users/fitim/IdeaProjects/PythonProject/" #driver_loc = "C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python37\\Scripts\\" #-- ENTER the file path of your exe # -- I created a separate folder to house the versions of chromedriver, previous versions will be deleted after downloading the newest version. # ie. version 75 will be deleted after 77 has been downloaded. # defining the file path of your exe file automatically updating based on your browsers current version of chrome. #currentPath = driver_loc + chrome_browser_version + driverName currentPath = driver_loc + driverName # check file directories to see if chrome drivers exist in nextVersion import os.path # check if new version of drive exists --> only continue if it doesn't Newpath = driver_loc + nextVersion match = False driver = webdriver.Chrome() str1 = driver.capabilities['browserVersion'] str2 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0] print(str1) print(str2) print(str1[0:2]) print(str2[0:2]) if str1[0:2] != str2[0:2]: print("please download correct chromedriver version") match = True else: print("chrome and driver match") driver.quit() # check if we have already downloaded the newest version of the browser, ie if we have version 76, and have already downloaded a version of 77, we don't need to run any more of the script. newfileloc = Newpath + driverName exists = os.path.exists(newfileloc) if (exists == False and match == True): #open chrome driver and attempt to download new chrome driver exe file. chrome_options = Options() executable_path = currentPath driver = webdriver.Chrome(executable_path=executable_path, options=chrome_options) # opening up url of chromedriver to get new version of chromedriver. chromeDriverURL = 'https://chromedriver.storage.googleapis.com/index.html?path=' + nextVersion driver.get(chromeDriverURL) time.sleep(5) # find records of table rows table = driver.find_elements_by_css_selector('tr') # check the length of the table Table_len = len(table) # ensure that table length is greater than 4, else fail. -- table length of 4 is default when there are no availble updates if (Table_len > 4): # define string value of link rowText = table[(len(table) - 2)].text[:6] time.sleep(1) # select the value of the row driver.find_element_by_xpath('//*[contains(text(),' + '"' + str(rowText) + '"' + ')]').click() time.sleep(1) #select chromedriver zip for windows driver.find_element_by_xpath('//*[contains(text(),' + '"' + "win32" + '"' + ')]').click() time.sleep(3) driver.quit() from zipfile import ZipFile import shutil fileName = r"C:\Users\fitim\Downloads\chromedriver_win32.zip" #--> enter your download path here. # Create a ZipFile Object and load sample.zip in it with ZipFile(fileName, 'r') as zipObj: # Extract all the contents of zip file in different directory zipObj.extractall(Newpath) # delete downloaded file os.remove(fileName) # defining old chrome driver location oldPath = driver_loc + lastVersion oldpathexists = os.path.exists(oldPath) # this deletes the old folder with the older version of chromedriver in it (version 75, once 77 has been downloaded) if (oldpathexists == True): shutil.rmtree(oldPath, ignore_errors=True) if match == False: return "no needed to do update" else: return "Done chromedriver update to version ", nextVersion
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 16 10:40:22 2020

@author: ADHIRAJ MAJUMDAR
"""
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import re
import csv
import pandas as pd
import time

options = Options()
chromedriver = "chromedriver.exe"
browser = webdriver.Chrome(chromedriver, options=options)
data = pd.read_excel('Input/input_company.xlsx')

with open('Output/IEC_Details_file.csv', 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow([
        "IEC", "IEC_Allotment_Date", "File_Number", "Party_Name_and_Address",
        "Phone_No", "e_mail", "Exporter_Type", "Date_of_Establishment",
        "PAN_ISSUE_DATE", "BIN", "PAN_ISSUED_BY", "Nature_Of_Concern", "Bank",
        "Dirct1", "Dirct2"
    ])
    for index, row in data.iterrows():
        browser.get("http://dgft.delhi.nic.in:8100/dgft/IecPrint")
        IEC = browser.find_element_by_xpath('/html/body/form/input[1]')
        if len(str(row["IEC"])) <= 9:
            IEC.send_keys('0' + str(row["IEC"]))
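# The length check above only restores a single missing leading zero, so values shorter
# than nine digits stay short.  If, as the padding suggests, the code should always be a
# fixed ten characters wide, a hedged simplification (same `row` and `IEC` element) is:
IEC.send_keys(str(row["IEC"]).zfill(10))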
def trackMultipleObjects(video): rectangleColor = (0, 255, 0) frameCounter = 0 currentCarID = 0 fps = 0 carTracker = {} carNumbers = {} carLocation1 = {} carLocation2 = {} speed = [None] * 1000 u=[] z=[] # Write output to video file #out = cv2.VideoWriter('outpy.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 10, (WIDTH,HEIGHT)) while True: start_time = time.time() rc, image = video.read() if type(image) == type(None): break image = cv2.resize(image, (WIDTH, HEIGHT)) resultImage = image.copy() Y_THRESH = 290 cv2.line(resultImage,(0, Y_THRESH),(1280, Y_THRESH),(255,0,0),2) frameCounter = frameCounter + 1 carIDtoDelete = [] for carID in carTracker.keys(): trackingQuality = carTracker[carID].update(image) if trackingQuality < 7: carIDtoDelete.append(carID) for carID in carIDtoDelete: print ('Removing carID ' + str(carID) + ' from list of trackers.') print ('Removing carID ' + str(carID) + ' previous location.') print ('Removing carID ' + str(carID) + ' current location.') carTracker.pop(carID, None) carLocation1.pop(carID, None) carLocation2.pop(carID, None) if not (frameCounter % 10): gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) cars = carCascade.detectMultiScale(gray, 1.1, 13, 18, (24, 24)) for (_x, _y, _w, _h) in cars: x = int(_x) y = int(_y) w = int(_w) h = int(_h) x_bar = x + 0.5 * w y_bar = y + 0.5 * h matchCarID = None for carID in carTracker.keys(): trackedPosition = carTracker[carID].get_position() t_x = int(trackedPosition.left()) t_y = int(trackedPosition.top()) t_w = int(trackedPosition.width()) t_h = int(trackedPosition.height()) t_x_bar = t_x + 0.5 * t_w t_y_bar = t_y + 0.5 * t_h if ((t_x <= x_bar <= (t_x + t_w)) and (t_y <= y_bar <= (t_y + t_h)) and (x <= t_x_bar <= (x + w)) and (y <= t_y_bar <= (y + h))): matchCarID = carID if matchCarID is None: print ('Creating new tracker ' + str(currentCarID)) tracker = dlib.correlation_tracker() tracker.start_track(image, dlib.rectangle(x, y, x + w, y + h)) carTracker[currentCarID] = tracker carLocation1[currentCarID] = [x, y, w, h] currentCarID = currentCarID + 1 #cv2.line(resultImage,(0,480),(1280,480),(255,0,0),5) for carID in carTracker.keys(): trackedPosition = carTracker[carID].get_position() t_x = int(trackedPosition.left()) t_y = int(trackedPosition.top()) t_w = int(trackedPosition.width()) t_h = int(trackedPosition.height()) cv2.rectangle(resultImage, (t_x, t_y), (t_x + t_w, t_y + t_h), rectangleColor, 4) # speed estimation carLocation2[carID] = [t_x, t_y, t_w, t_h] end_time = time.time() if not (end_time == start_time): fps = 1.0/(end_time - start_time) #cv2.putText(resultImage, 'FPS: ' + str(int(fps)), (620, 30),cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2) for i in carLocation1.keys(): if frameCounter % 1 == 0: [x1, y1, w1, h1] = carLocation1[i] [x2, y2, w2, h2] = carLocation2[i] # print 'previous location: ' + str(carLocation1[i]) + ', current location: ' + str(carLocation2[i]) carLocation1[i] = [x2, y2, w2, h2] # print 'new previous location: ' + str(carLocation1[i]) if [x1, y1, w1, h1] != [x2, y2, w2, h2]: if (speed[i] == None or speed[i] == 0) and y1 >= 275 and y1 <= 285: speed[i] = estimateSpeed([x1, y1, w1, h1], [x2, y2, w2, h2]) if int(speed[i])>65: winsound.PlaySound('speed_car_sound.wav', winsound.SND_FILENAME) u.append(speed[i]) cv2.imwrite('speeding_%s.png' % i, resultImage) options = Options() options.add_argument("--use-fake-ui-for-media-stream") timeout = 20 driver = webdriver.Chrome(executable_path = 'D:/project/computervision/garbage detection/pyPushBullet-master/chromedriver_win32/chromedriver.exe', 
chrome_options=options) driver.get("https://mycurrentlocation.net/") wait = WebDriverWait(driver, timeout) longitude = driver.find_elements_by_xpath('//*[@id="longitude"]') longitude = [x.text for x in longitude] longitude = str(longitude[0]) latitude = driver.find_elements_by_xpath('//*[@id="latitude"]') latitude = [x.text for x in latitude] latitude = str(latitude[0]) driver.quit() m=(latitude,longitude) num=speed[i] database(num) z.append(m) #if y1 > 275 and y1 < 285: if speed[i] != None and y1 >= 180: cv2.putText(resultImage, str(int(speed[i])) + " km/hr", (int(x1 + w1/2), int(y1-5)),cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2) #winsound.PlaySound('speed_car_sound.wav', winsound.SND_FILENAME) #print ('CarID ' + str(i) + ': speed is ' + str("%.2f" % round(speed[i], 0)) + ' km/h.\n') #else: # cv2.putText(resultImage, "Far Object", (int(x1 + w1/2), int(y1)),cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) #print ('CarID ' + str(i) + ' Location1: ' + str(carLocation1[i]) + ' Location2: ' + str(carLocation2[i]) + ' speed is ' + str("%.2f" % round(speed[i], 0)) + ' km/h.\n') cv2.imshow('result', resultImage) # Write the frame into the file 'output.avi' #out.write(resultImage) key = cv2.waitKey(1) & 0xFF # if the `q` key was pressed, break from the loop if key == ord("q"): break cv2.destroyAllWindows() return u,z
def scrape_yieldwatch(my_address: Optional[str] = None, headless=True, timeout: int = 30): config = read_config() if my_address is None: my_address = config["bsc"]["address"] chrome_options = Options() if headless: chrome_options.add_argument("--headless") with webdriver.Chrome(options=chrome_options) as driver: WebDriverWait(driver, timeout) driver.get("https://www.yieldwatch.net/") for letter in my_address: address_bar = driver.find_element_by_id("addressInputField") address_bar.send_keys(letter) icon_bar = driver.find_element_by_class_name( "centered.bottom.aligned.row") buttons = icon_bar.find_elements_by_class_name("center.aligned.column") for button in buttons: grayscale = button.find_element_by_class_name( "ui.centered.image").value_of_css_property("filter") if grayscale == "grayscale(1)": button.click() button = driver.find_element_by_class_name("binoculars") button.click() # Wait until the next page is loaded element_present = presence_of_element_located( (By.CLASS_NAME, "content.active")) WebDriverWait(driver, timeout).until(element_present) infos = defaultdict(dict) segments = driver.find_elements_by_class_name("ui.segment") for segment in segments: # Many elements have the "ui segment" class, only pick the ones with # the "accordion ui" style. for defi in segment.find_elements_by_class_name("accordion.ui"): boxes = defi.find_elements_by_class_name("ui.equal.width.grid") if not boxes: continue which = defi.text.split("\n")[0] for box in boxes: header, content = box.find_elements_by_class_name("row") header_text = header.text.split("\n") box_name = header_text[0] dollar_value = header_text[1] assert "$" in dollar_value dollar_value = float( dollar_value.replace(",", "").replace("$", "")) # Get the columns in the box, only the first two are relevant columns = content.find_elements_by_class_name( "collapsing.right.aligned") names = columns[0].text.split("\n") amounts = columns[1].text.split("\n") d = defaultdict(list) for i, amount in enumerate(amounts): amount, coin = amount.split(" ", 1) name = names[min(i, len(names) - 1)] amount = (float(amount[:-1]) * 1000 if "k" in amount else float(amount)) d[name].append((amount, coin)) d = dict(d) d["dollar_value"] = dollar_value infos[which][box_name] = dict(d) return dict(infos)
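# A small usage sketch for the scraper above: run it against the address from the config
# file (my_address=None) and print the dollar value of each box it found.  The printing
# loop below is an assumption about how the returned nested dict would typically be used:
if __name__ == "__main__":
    positions = scrape_yieldwatch(my_address=None, headless=True, timeout=30)
    for platform, boxes in positions.items():
        for box_name, box in boxes.items():
            print(platform, box_name, box["dollar_value"])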
class DmozSpider4(scrapy.Spider): # 继承Spider类 print("进入%s了!!!!!!!!!" % num) import os if os.path.exists('output'): shutil.rmtree('output') yuming = '中国青年' lang = '英语' ''' 超参数都在这里修改, 就下面这2个有用.name 随便起一个,在main函数里面调用这个名就行. html就是要爬取的网站. ''' name = "dmoz%s" % num # 爬虫的唯一标识,不能重复,启动爬虫的时候要用 print("name", name) # html='http://www.171english.cn/news/' # html='http://www.171english.cn/news/2018' # html='http://www.171english.cn/news/2019' html = 'http://www.kantsuu.com/cnjp/List_1408.shtml' # html=' http://roll.edu.sina.com.cn/english/syxw/ss4/index_5.shtml' html = html.strip() from bs4 import BeautifulSoup #首页写这里 baseUrl = html import requests # a=requests.get(html).content # bs = BeautifulSoup(a, "html.parser") # 缩进格式 # print(bs) # 下面冲bs中找到所有爬取的页. # print(bs.find_all("a")) # 获取所有的a标签,也就是超链接 from selenium import webdriver import sys # browser = webdriver.Firefox() # Get local session of firefox # aaa=browser.get("http://news.sina.com.cn/c/2013-07-11/175827642839.shtml ") # Load page # print(aaa) saveall = [html] print(777777777777777777777777777777, baseUrl) if 0: #调试用, 一般不用这么跑.这个只是动态js代码需要这么使用而已. 一般网页没有这种方式.这个方式太慢爬虫.但是可以避免不必要的js bug while 1: tmpurl = saveall[-1] from selenium import webdriver from selenium.webdriver.chrome.options import Options chrome_options = Options() chrome_options.add_argument("--headless") from .utilsme import driver base_url = tmpurl driver.get(base_url) # 注意这里面结果直接写到deriver里面 # print(driver.page_source) a = driver.page_source bs = BeautifulSoup(a, "html.parser") # 缩进格式 # print(bs) # 下面冲bs中找到所有爬取的页. # print(bs.find_all("a")) import re # tmp=bs.find_all(text=re.compile("Next[ ]*")) # print(tmp) now = None for s in bs('a'): # print(s.text,444444444444444444444444444444444444444444444444) if s.text == "»": now = s.extract() # 需要对now进行中文转码 # now=parse.quote(now.get('href')) print("loook", now) # 注意这种旧网站的编码方式. now = parse.quote(now.get('href'), safe=";/?:@&=+$, ", encoding="gbk") now = 'https://ru.hujiang.com/' + now print(now, "now网页是!!!!!!!!!!") if now == None or now in saveall: #防止循环 break else: saveall.append(now) print(saveall, '最后获取的所有index页') #-------------推荐的方式获取全部index页 # 下面是直接匹配方式获取所有index页. 也就是一般需求这么跑就可以获取所有index页了. import urllib.request # 导入urllib.request库 if 0: #调试用 while 1: tmpurl = saveall[-1] import urllib from bs4 import BeautifulSoup url = tmpurl print(url, 8989898998) print(repr(url), 9999999999999999999999999999999999999999999999999999) a = urllib.request.urlopen(url) # 打开指定网址 page = a.read() # 读取网页源码 try: page = page.decode('gbk') # 会有2中编码方式. gbk 或者utf-8 except: page = page.decode('utf-8') # 会有2中编码方式. gbk 或者utf-8 print(type(page), 'yuio') # page = requests.get(url) # 开不开,一直404. # page = requests.get('http://www.i21st.cn/story/index_1.html') # 开不开,一直404. # page.encoding = 'utf-8' # soup = BeautifulSoup(page,"html.parser") print(page, 3434343434343) bs = BeautifulSoup(page, "html.parser") # 缩进格式 print(bs, 999999999999999999999999999999999999) # print(bs) # 下面冲bs中找到所有爬取的页. # print(bs.find_all("a")) import re # tmp=bs.find_all(text=re.compile("Next[ ]*")) # print(tmp) now = None print(url, bs('a'), 'uiop') for s in bs('a'): print(s.text, 'yyyyyyyyyy') if s.text == "下一页": now = s.extract() print(now, 12345) # 需要对now进行中文转码 # now=str(now) print(now, 888888888888888888888888) # now=parse.quote(re.findall(r'href=".*"',now)[0]) print("loook", now) # 注意这种旧网站的编码方式. now = parse.quote( now.get('href'), safe=";/?:@&=+$%, ", encoding="gbk" ) # 中文的处理方式是里面加上%即可!!!!!!!!!!!!!!!!!!!!!! 
print(89898934392423423, now) if now[0] == '.': now = now[2:] now = now # now=r'https://' + 'jp.hjenglish.com'+now print(now, "now网页是!!!!!!!!!!") if now == None: break else: # print(now,556565656565) saveall.append(now) print("我们通过普通index算法得到所有的index页信息是", saveall) # 直接修改这里面!!!!!!!!!!!!! ,可以手动的吧上面的到的saveall直接改下面即可.就得到了全爬虫. saveall = [ #'http://www.171english.cn/news/2018/june/', html, ] start_urls = saveall # 开始爬取的链接 start_urls必须用这个名. def parse(self, response): # 一级爬取代码 print("进入了一级爬虫") #xpath教学:https://blog.csdn.net/qq_27283619/article/details/88704479 #https://www.cnblogs.com/wt7018/p/11749778.html # @表示属性 # 好像使用框架scrapy没法debug.只能疯狂print了 # help(response.url) print(response.url, 77777777777777777777777777777777777777777777777777) print(response, '**********************当前爬取的网页链接') div_list = response.xpath('//td[@class="lbxx"]//a/@href') # 加入正则 # div_list = response.xpath('//div[@class="module cl xl"]/ul/li') # 加入正则 # print(85654645654, div_list) div_list = [i.extract() for i in div_list] # 去掉调回的情况. div_list = [i for i in div_list if i != response.url] div_list = list(set(div_list)) print(85654645654, div_list) # div_list = response.xpath('//div[@class="newslist solid"]') # 加入正则 # print(90909090,div_list) # print(div_list) # print(div_list[0]) # print(div_list[-1]) # print((div_list)) # print(div_list,99999999999999999999999999999999999999) for i in div_list: # print(self.baseUrl+i.extract())# 获得了全部链接,进入二级爬虫. item = en_youth() item['link'] = i item['link'] = item['link'] # print(item['link'],"lianjie !!!!!!!!!!!!!!!!!!!!!!") #每一次一级爬虫得到的页面,都触发一次二级爬虫. yield scrapy.Request(item['link'], callback=self.parse_detail, meta={'item': item}, encoding='raw_unicode_escape') #https://blog.csdn.net/Light__1024/article/details/88763541 如何进行爬取二级界面 def parse_detail(self, response): # 二级爬取代码 infomation = response.meta['item']['link'] # print(infomation,988776754456435345435345435) print(infomation, "二级爬取的地址是") item = response.body # print(item,9090909090909090909090909090) # print(item,444444444444444444444444444444444444) # print(item) # print(response.body,"???????????????") # print("********打印二次爬虫结果")#[@class="TRS_Editor"] item = en_youth() print('进入2极品宠') # 预过滤: 改了body,但是还是不生效.?? # # # response.body="dfadsf" # # tmp=re.sub(r'<script.*</script>','',str(response.body)) # print(tmp,6666666666666666666666666666666666666666) # response._set_body(tmp.encode(response.encoding)) # print(response.body,777777777777777777777777777777777777777777777) # print(response.body,88888888888888888888888888888888888) # HtmlResponse.replace() # HtmlResponse.replace('body',remove_tags_with_content(response.body, 'script')) # HtmlResponse.replace('body',remove_tags_with_content(response.body, 'script')) # tmp2=response.xpath('//td[@class="e14"]//text()').extract() #下面要设计多重xpath判断.因为格式不同意. # 下面这个是只有div 里面写没有p标签. # 如果要提取这个标签里面的不管多深的全部文本, 就不用写细节了.直接div extract就可以实现! 
# item['neirong']= response.xpath('//div//p').extract() # print( item['neirong'],33333333333333333333333333333333333333333333333333333333333) item['neirong'] = response.xpath('//tr//p').extract() # item['neirong']+= response.xpath('//div[@class="content"]//p').extract() # item['neirong']+= response.xpath('//div[@id="article"]//p').extract() # item['neirong']+= response.xpath('//td[@class="e14"]').extract() # item['neirong']+= response.xpath('//td[@id="article_content"]').extract() # print(item['neirong'],22222222222222222222222) save = [] item['neirong'] = [i for i in item['neirong'] if '<script' not in i] print('tttt', item['neirong']) item['neirong'] = [replace_tags(i, '') for i in item['neirong']] print('neirong2222222222222', item['neirong']) # item['neirong']+= response.xpath('//div[@id="article"]/div/p/text()').extract() # item['neirong']+= response.xpath('//div[@id="article"]/p/text()').extract() # 下面进行脚本滤过. # item['neirong'] = filter(lambda x: '<script>'not in x, item['neirong']) # print(item['neirong'], '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') # print(item['neirong'], 8888888888888888888) save2 = '\n'.join(item['neirong']) print(save2, 9999999999999999999999999999999999999) item['neirong'] = save2 item['title'] = infomation yield item # 下面学习pipeline, 进行文件读写. # setttings里面设置pipeline写入文件 #https://www.cnblogs.com/python2687806834/p/9836935.html pass # # if __name__=="__main__": # DmozSpider()