def edge_driver(request: "SubRequest") -> Union[Remote, Edge]:
    """Yield a Selenium-controlled Edge instance and tear it down afterwards.

    A local Edge driver is started when the requesting test class declares
    ``test_type == "edge-local"``; otherwise a remote session is opened
    against the Sauce Labs hub.

    Args:
        request: pytest fixture request; ``request.cls`` supplies the
            ``test_type`` switch and receives the driver on ``cls.driver``.

    Yields:
        The configured ``Edge`` or ``Remote`` driver instance.
    """
    if request.cls.test_type == "edge-local":
        driver = Edge()
    else:
        # resolve_ip=False keeps the hub hostname as-is (required for
        # tunneled/Sauce connections) instead of pre-resolving to an IP.
        executor = RemoteConnection(SAUCE_HUB_URL, resolve_ip=False)
        driver = Remote(desired_capabilities=SAUCE_EDGE, command_executor=executor)
    set_selenium_driver_timeouts(driver)
    request.cls.driver = driver
    try:
        yield driver
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window and can leak the browser process / remote session.
        driver.quit()
def livingly():
    """Crawl a Livingly runway gallery and open each slide's zoomed image.

    Walks the thumbnail strip on the gallery page, visits every linked
    slide and, when a zoomable slideshow image is present, opens the
    full-size image in a new tab, right-clicks it, then closes that tab
    and navigates back. All work is side effects on the browser session.
    """
    url = 'https://www.livingly.com/runway/Milan+Fashion+Week+Fall+2019/Aigner/Details/browse'
    driver = Edge(executable_path=PATH)
    action_chains = ActionChains(driver)
    driver.get(url)
    # WebDriverWait's timeout is in SECONDS, not milliseconds: the original
    # 5000 would have blocked for up to ~83 minutes before timing out.
    WebDriverWait(driver, 5).until(
        expected_conditions.visibility_of_element_located(
            (By.CLASS_NAME, 'thumbnail-strip')))
    content = driver.find_element_by_xpath('//ul[@class="thumbnail-strip"]')
    links = content.find_elements_by_tag_name('a')
    # Store the hrefs beforehand: navigating away invalidates the
    # WebElement handles Selenium returned for this page.
    paths = [link.get_attribute('href') for link in links]
    for path in paths:
        driver.get(path)
        # Seconds, not milliseconds (was 3000 — see note above).
        WebDriverWait(driver, 3).until(
            expected_conditions.visibility_of_element_located(
                (By.CLASS_NAME, 'region-image')))
        try:
            slideshow = driver.find_element_by_xpath('//div[@class="slideshow-img-link"]')
        except Exception:
            # Slide has no slideshow block — go back and try the next one.
            driver.execute_script('window.history.go(-1);')
            continue
        if slideshow.is_displayed():
            big_image_url = slideshow.find_element_by_tag_name('img').get_attribute('data-zoom-url')
            if big_image_url:
                driver.execute_script(f'window.open("{big_image_url}", "_blank");')
                # Right-click the image to trigger the context menu / save.
                # BUGFIX: context_click() needs a single WebElement; the
                # original passed the LIST from find_elements_by_tag_name,
                # which raises at perform().
                image = driver.find_element_by_tag_name('img')
                action_chains.context_click(image).perform()
                # Close the newly opened image tab and return to the
                # previous one.
                driver.switch_to_window(driver.window_handles[1])
                driver.close()
                driver.switch_to_window(driver.window_handles[0])
                # Go back in history to the thumbnail strip.
                driver.execute_script('window.history.go(-1);')
        else:
            driver.execute_script('window.history.go(-1);')
    # quit() ends the whole session; close() would only close one window.
    driver.quit()
def spider_hero():
    """Scrape hero names from the PvP hero-list page into ``hero_name.txt``.

    Opens the hero list in Edge, reads every list item's text, quits the
    browser, then writes one name per line (UTF-8).
    """
    url = "https://pvp.qq.com/web201605/herolist.shtml"
    # Raw string: a Windows path must not have its backslashes treated as
    # escape sequences (non-raw '\P', '\M', ... emit SyntaxWarning on
    # modern Python and are fragile).
    browser = Edge(
        executable_path=r'C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe')
    browser.get(url)
    items = browser.find_elements_by_css_selector(
        "body > div.wrapper > div > div > div.herolist-box > div.herolist-content > ul > li"
    )
    # Read .text before the browser goes away — elements die with the session.
    hero_name = [item.text for item in items]
    # quit() fully ends the WebDriver session; close() only closes the window
    # and can leave the driver process running.
    browser.quit()
    with open("hero_name.txt", 'w', encoding="utf-8") as f:
        f.writelines(name + '\n' for name in hero_name)
    print("写入完毕")
def spider_equipment():
    """Scrape equipment names from the PvP item page into ``equipment_name.txt``.

    Opens the item list in Edge, reads every list item's text, quits the
    browser, then writes one name per line (UTF-8).
    """
    url = "https://pvp.qq.com/web201605/item.shtml"
    # Raw string: keep Windows-path backslashes literal (non-raw strings
    # emit SyntaxWarning for unknown escapes on modern Python).
    browser = Edge(
        executable_path=r'C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe')
    browser.get(url)
    items = browser.find_elements_by_css_selector("#Jlist-details > li")
    # Read .text before the browser goes away — elements die with the session.
    equip_name = [item.text for item in items]
    # quit() fully ends the WebDriver session; close() only closes the window.
    browser.quit()
    with open("equipment_name.txt", 'w', encoding="utf-8") as f:
        f.writelines(name + '\n' for name in equip_name)
    print("写入完毕")


# spider_hero()
# spider_equipment()
def getcomponies():
    """Scrape DOSAB member companies, OCR their e-mail images, write to Excel.

    For every previously downloaded ``imgs/<id>_EMail.jpg`` file: load the
    company page to read the company name, OCR the e-mail image with
    Tesseract, and write (name, mail) rows to the ``dosab`` worksheet of
    ``excel_file_name``.

    :return: None — output is the workbook on disk.
    """
    _bases.kill_web_driver_edge()  # ensure no stale Edge driver is running
    driver = Edge()
    componies = []
    driver.get('https://www.dosab.org.tr/Alfabetik-Firmalar-Listesi')
    # OCR each previously downloaded e-mail image to recover the address.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Users\abdul\AppData\Local\Tesseract-OCR\tesseract.exe'
    imgfiles = os.listdir('imgs')
    imgfiles.sort()
    for imgfile in imgfiles:
        # Filenames look like "<compid>_EMail.jpg" — the id keys the page URL.
        compid = imgfile.split('_EMail.jpg')[0]
        driver.get(f'https://www.dosab.org.tr/Firma/{compid}')
        compname = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/h4').text
        img = cv2.imread(f'imgs/{imgfile}')
        emailtext = str(pytesseract.image_to_string(img, lang='eng')).replace(
            '\n\f', '')
        # Discard OCR output that clearly is not an e-mail address.
        if '@' not in emailtext:
            emailtext = ''
        company = {'mail': emailtext, 'name': compname}
        componies.append(company)
    # --- Excel rendering ---
    workbook = Workbook(excel_file_name)
    worksheet = workbook.add_worksheet('dosab')
    row = 0
    hformat = workbook.add_format()  # bold header format
    hformat.set_bold()
    worksheet.write(row, 0, "Firma Adi", hformat)
    worksheet.write(row, 1, 'Mailler', hformat)
    row += 1
    for comp in componies:
        worksheet.write(row, 0, comp["name"])
        if '@' in comp['mail']:
            worksheet.write(row, 1, comp['mail'])
        row += 1
    workbook.close()
    # quit() ends the whole WebDriver session; close() only closes the window.
    driver.quit()
#"87073644", #"86963282", # "87052262", # "87033492", # "87073632", # "87033441", # "86965699", # "86965230") # s.add_artical("85566269","86847612") #s.add_author("35692440",slice(4)) # s.add_author("16778114",slice(0,None)) s.add_author("6657532", slice(12)) cookie = driver.get_cookies() driver.close() s.start() with open("cookies.json", "w") as target: json.dump(cookie, target, indent=4) { "GET": { "scheme": "https", "host": "117-27-114-202.mcdn.bilivideo.cn:480", "filename": "/upgcxcode/79/81/262668179/262668179-1-30080.m4s", "query": { "expires": "1607517453", "platform": "pc", "ssig": "stFfmYCY-VzQJJhIhQJUaw",
def getcomponies():
    """Scrape NOSAB member companies by sector and write them to an Excel file.

    Visits the sector index, follows every sector link, clicks through each
    company entry to capture its name and detail text, then renders one
    worksheet: a bold sector header row followed by that sector's companies.

    :return: None — output is the ``excel_file_name`` workbook on disk.
    """
    _bases.kill_web_driver_edge()  # ensure no stale Edge driver is running
    driver = Edge()
    componies = []
    driver.get('https://www.nosab.org.tr/firmalar/tr')
    # Collect the per-sector links up front as plain strings: the
    # WebElements go stale once we start navigating.
    alphabetslinks = []
    for links in driver.find_elements(By.XPATH, '//*[@id="accordion-2"]/li/a'):
        link = {
            'Sector': links.text,
            'Name': links.get_attribute('href')  # NOTE: 'Name' actually holds the URL
        }
        alphabetslinks.append(link)
    for anchor in alphabetslinks:
        driver.get(anchor['Name'])
        companies_sector = {
            'Sector': anchor['Sector'],
            'comps': []
        }
        # Count the entries once, then re-locate each by index on every
        # iteration: driver.back() invalidates previously found elements.
        componies_count = len(driver.find_elements(By.XPATH, '/html/body/div[7]/div/div[2]/div[3]/ul/li/a'))
        for indx in range(1, componies_count + 1):
            comp = driver.find_element(By.XPATH, f'/html/body/div[7]/div/div[2]/div[3]/ul/li[{indx}]/a')
            comp.click()
            companies_sector['Sector'] = anchor['Sector']  # NOTE(review): redundant — already set above
            company = {
                'Name': driver.find_element(By.XPATH, '/html/body/div[7]/div/div[2]/div[1]/div').text,
                'Data': str(driver.find_element(By.XPATH, '/html/body/div[7]/div/div[2]/div[4]').text)
            }
            companies_sector['comps'].append(company)
            driver.back()
        componies.append(companies_sector)
    # --- Excel rendering ---
    row = 0
    workbook = Workbook(excel_file_name)
    worksheet = workbook.add_worksheet('nosab')
    hformat = workbook.add_format()  # bold, centered header format
    hformat.set_bold()
    hformat.set_align('center')
    hformat.set_align('vcenter')
    worksheet.write(row, 0, 'Firma Adi', hformat)
    worksheet.set_column('A:A', 100)
    worksheet.write(row, 1, 'Bilgileri', hformat)
    worksheet.set_column('B:B', 120)
    row += 1
    fwarp = workbook.add_format()  # wrap text in the (long) detail column
    fwarp.set_text_wrap()
    fname_centralize = workbook.add_format()
    fname_centralize.set_align('center')
    for company in componies:
        if 'Sector' in company:
            worksheet.write(row, 0, company['Sector'], hformat)
            row += 1
        if 'comps' in company:
            for comp in company['comps']:
                if 'Name' in comp:
                    worksheet.write(row, 0, comp['Name'], fname_centralize)
                if 'Data' in comp:
                    worksheet.write(row, 1, comp['Data'], fwarp)
                row += 1
    # NOTE(review): the pre-existing file is removed *before* workbook.close()
    # writes the new one; xlsxwriter overwrites the file on close() anyway,
    # so confirm the remove + sleep is really needed. The driver is also
    # never quit inside this function — presumably handled by the caller.
    if os.path.exists(excel_file_name):
        os.remove(excel_file_name)
        time.sleep(_bases.timeout)
    workbook.close()
# NOTE(review): stray module-level teardown — relies on a `driver` bound
# elsewhere in this module; close() only closes the current window (quit()
# would end the whole WebDriver session) — confirm which is intended.
driver.close()