def _crawl(detail_link: str, driver: WebDriver) -> Info:
    """Open one detail page and return its converted field data.

    Args:
        detail_link: URL of the detail page to load.
        driver: Browser session used to load and inspect the page.

    Returns:
        The result of ``_convert`` applied to whatever
        ``_get_all_present_fields`` collects from the page.
    """
    driver.get(detail_link)
    time.sleep(5)  # fixed pause after navigation before touching the DOM
    accordeon = driver.find_elements_by_class_name('accordeon')
    try:
        # Get rid of the cookie banner blocking all clicks on this website.
        driver.find_element_by_class_name(
            'moove-gdpr-infobar-allow-all').click()
    except Exception:
        # Dismissing the banner is best effort: any failure to find or click
        # it is ignored. ``Exception`` (instead of a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit able to stop a long crawl.
        pass
    info = _get_all_present_fields(accordeon, driver)
    return _convert(info)
def _get_detail_links(driver: WebDriver) -> List[str]:
    """Collect the ``href`` of every anchor inside the ``toplist`` element.

    Args:
        driver: Browser session that already shows the listing page.

    Returns:
        One entry per ``<a>`` found below the element with id ``toplist``,
        in the order the driver reports them.
    """
    time.sleep(10)  # website is very slow
    anchors = driver.find_element_by_id('toplist').find_elements_by_tag_name(
        'a')
    return [anchor.get_attribute("href") for anchor in anchors]
def _crawl_detail_page(driver: WebDriver,
                       first_idx: int = 1,
                       stop_idx: int = 315) -> List[DetailPageInfo]:
    """Visit the detail page of every listed entry and collect its info.

    For each index the listing at ``URL`` is loaded again, the table row
    whose first cell yields that index (via ``extract_idx``) is located, its
    link is clicked and ``_extract_detail_info`` is run on the resulting page.
    Indices without a matching row are silently skipped.

    Args:
        driver: Browser session used for all navigation.
        first_idx: First row index to crawl (inclusive). Defaults to the
            previously hard-coded value ``1``.
        stop_idx: Index at which to stop (exclusive). Defaults to the
            previously hard-coded value ``315``.

    Returns:
        The collected detail infos after ``_clean_detail_pages`` was applied
        to the list.
    """
    detail_pages_infos = []
    for idx in range(first_idx, stop_idx):
        # The listing is reloaded for every index; presumably the click below
        # navigates away and invalidates the previously located rows.
        driver.get(URL)
        rows: List[WebElement] = _get_table_rows(driver)
        for row in rows:
            cols = row.find_elements_by_tag_name('td')
            if not cols:
                # A row without data cells cannot carry an index.
                continue
            row_idx: int = extract_idx(cols[0])
            if row_idx != idx:
                continue
            cols[0].find_element_by_tag_name('a').click()
            detail_pages_infos.append(_extract_detail_info(driver, idx))
            break
    _clean_detail_pages(detail_pages_infos)
    return detail_pages_infos
def ie(path='./IEDriverServer.exe') -> WebDriver:
    """
    Create an IE browser session.

    The session gets a page-load timeout of 30, an implicit wait of 10 and a
    maximized window before it is returned.

    :param path: path to the IE driver executable
    :return: WebDriver
    """
    browser = IE(executable_path=path)
    browser.set_page_load_timeout(30)
    browser.implicitly_wait(10)
    browser.maximize_window()
    return browser
def _extract_soft_facts(driver: WebDriver) -> SoftFacts:
    """Read the free-text ("soft") facts from the currently open page.

    The facts are taken from fixed positions: the ``<p>`` elements inside the
    37th ``<div>`` of the page (index 36). Paragraphs 0, 1, 3, 7 and 8 hold,
    in that order, the aims, activities, accreditations, areas of competence
    and geographical representation.

    Args:
        driver: Browser session showing a detail page.

    Returns:
        A ``SoftFacts`` built from the stripped paragraph texts; the last two
        are first passed through ``_split_into_areas`` and
        ``_split_into_countries``.
    """
    container = driver.find_elements_by_tag_name('div')[36]
    paragraphs = container.find_elements_by_tag_name('p')

    def stripped(position: int) -> str:
        # Text of the paragraph at ``position`` without surrounding whitespace.
        return paragraphs[position].text.strip()

    aims = stripped(0)
    activities = stripped(1)
    accreditations = stripped(3)
    areas_of_competence = _split_into_areas(stripped(7))
    geographical_representation = _split_into_countries(stripped(8))
    return SoftFacts(aims, activities, accreditations, areas_of_competence,
                     geographical_representation)
def _extract_hard_facts(driver: WebDriver, idx: int) -> HardFacts:
    """Read the structured ("hard") facts from the currently open page.

    Website and president name come from fixed positions inside the 13th
    ``<div>`` (index 12): the second ``<span>`` and the ``<h4>`` of the fifth
    element with class ``row``. The remaining values are delegated to
    ``_extract_quick_facts``.

    Args:
        driver: Browser session showing a detail page.
        idx: Index of the entry, forwarded to ``_extract_quick_facts``.

    Returns:
        A ``HardFacts`` holding website, president name, founding year,
        staff number, members number and languages.
    """
    block = driver.find_elements_by_tag_name('div')[12]

    spans = block.find_elements_by_tag_name('span')
    website = spans[1].text.strip()

    president_row = block.find_elements_by_class_name('row')[4]
    president_heading = president_row.find_element_by_tag_name('h4')
    president_name = president_heading.text.strip()

    founded, staff, members, spoken = _extract_quick_facts(driver, idx)
    return HardFacts(website, president_name, founded, staff, members,
                     spoken)
def getWebDriver(self, browser):
    """Create a maximized driver for the requested browser.

    ``browser`` is matched case-insensitively against "IE", "CHROME" and
    "FIREFOX". The name is printed first; on success a line is appended to
    ``self.logger``.

    :param browser: browser name
    :return: the new driver, or None when the name is not recognised
    """
    print(browser)
    requested = browser.upper()

    if requested == "IE":
        driver = InternetExplorerDriver()
        log_line = "新建IE驱动"
    elif requested == "CHROME":
        chrome_opts = ChromeOptions()
        chrome_opts.add_argument("test-type")
        driver = ChromeDriver(chrome_options=chrome_opts)
        log_line = "新建chrome驱动"
    elif requested == "FIREFOX":
        # NOTE(review): the name ``FirefoxDriverDriver`` looks like a typo
        # for ``FirefoxDriver`` - confirm against the file's imports.
        driver = FirefoxDriverDriver()
        log_line = "新建FireFox驱动"
    else:
        return None

    # Shared tail of every supported branch: maximize, log, hand back.
    driver.maximize_window()
    self.logger.appendContent(log_line)
    return driver
def batchAccept(browser: WebDriver):
    """Batch-accept the pending work orders whose title contains '[数据网]'.

    Loads the pending-order listing, ticks the checkbox of every matching
    order and then runs the page's own ``batchAccept()`` JavaScript function.
    Returns early, without touching the page, when the listing reports no
    orders.

    Args:
        browser: Browser session used to open and drive the listing page.

    Raises:
        AttributeError: If the pagination text does not contain the expected
            "共N条数据" counter (``re.search`` returns ``None``).
    """
    u = 'http://10.204.14.35/eoms4/sheetBpp/myWaitingDealSheetQueryty.action?baseSchema=WF4_EL_TTM_TTH_EQU&var_pagesize=100'
    browser.get(u)
    total_ele = 'form#form1 span.pagenumber'
    total_element = browser.find_element(By.CSS_SELECTOR, total_ele)
    # The counter is captured as text; convert it before comparing, otherwise
    # ``str < int`` raises TypeError on Python 3.
    total = int(re.search(r'共([0-9]+)条数据', total_element.text).group(1))
    if total < 1:
        return
    # Drop the header row. The remaining rows come in pairs: the row holding
    # the checkbox is followed by the row holding the title.
    todo_list = browser.find_elements(By.CSS_SELECTOR, 'table#tab tr')[1:]
    # The reported total may exceed the rows rendered on this page (the URL
    # requests var_pagesize=100), so never index past what was found.
    shown = min(total, len(todo_list) // 2)
    for x in range(shown):
        title = todo_list[2 * x + 1].text
        if '[数据网]' in title:
            checkbox = todo_list[2 * x].find_element(By.NAME, 'checkid')
            checkbox.click()
    browser.execute_script('batchAccept();')
def jumpToEOMS(browser: WebDriver, keyword='') -> tuple:
    """Collect pending EOMS fault work orders and push a summary message.

    Steps performed:

    1. Open the EOMS bridge URL, then the pending work-order listing.
    2. For every listed order whose title contains ``keyword``, record its
       title (from character 21 on), end time, find time, status and the
       URL of its detail sheet.
    3. Open each recorded order, read its ``INC_Alarm_ClearTime`` field and,
       when the title has the form "<DEVICE> 上报 <EVENT>", store device and
       event as well.
    4. Send a markdown summary through ``send_msg`` and navigate back to the
       bridge URL.

    Args:
        browser: Browser session used for all navigation.
        keyword: Substring a title must contain to be included; the default
            empty string matches every order.

    Returns:
        A 3-tuple ``(data, data2, data3)`` of lists of dicts: all matching
        orders, those with a non-empty clear time, and the remaining ones.
        The dicts are shared between the lists.

    Raises:
        AttributeError: If the pagination text does not contain the expected
            "共N条数据" counter (``re.search`` returns ``None``).
    """
    eoms_url = 'http://uip.ln.cmcc/_layouts/Document/BridgeToSPControl.aspx?skipcode=emoss'
    browser.get(eoms_url)
    # Fault-handling work orders (equipment):
    todo_list_url = 'http://10.204.14.35/eoms4/sheetBpp/myWaitingDealSheetQueryGlobalTemplate.action?baseSchema=WF4_EL_TTM_TTH_EQU&var_pagesize=100'
    browser.get(todo_list_url)
    total_ele = 'form#form1 span.pagenumber'
    total_element = browser.find_element(By.CSS_SELECTOR, total_ele)
    total = int(re.search(r'共([0-9]+)条数据', total_element.text).group(1))
    # Drop the header row. The remaining rows come in pairs: a row with the
    # order's cells followed by a row with its title.
    todo_list = browser.find_elements(By.CSS_SELECTOR, 'table#tab tr')[1:]

    def get_url(title_ele):
        # Build the detail-sheet URL from the quoted arguments of the title
        # link's ``onclick`` handler.
        onclick = title_ele.find_element_by_tag_name('a').get_attribute(
            'onclick')
        args = re.findall(r'\'(.*?)\'', onclick)
        url = 'http://10.204.14.35/eoms4/sheet/openWaittingSheet.action?baseSchema={}&taskid={}&baseId={}&entryId=&version=&processType={}'
        # Note the order: the URL wants taskid (args[2]) before baseId
        # (args[1]).
        return url.format(args[0], args[2], args[1], args[3])

    # Pending work orders matching the keyword.
    data = []
    # The reported total may exceed the rows rendered on this page (the URL
    # requests var_pagesize=100), so never index past what was found.
    shown = min(total, len(todo_list) // 2)
    for x in range(shown):
        title = todo_list[2 * x + 1]
        title_text = title.text  # read once; each access queries the browser
        if keyword not in title_text:
            continue
        content = todo_list[2 * x].find_elements_by_tag_name('td')
        data.append({
            'title': title_text[21:],
            'end_time': datep(content[4].text),
            'find_time': datep(content[5].text),
            'status': content[6].text,
            'url': get_url(title),
        })

    # Orders that already carry a clear time.
    data2 = []
    # Orders without a clear time.
    data3 = []
    for order in data:
        browser.get(order['url'])
        order['clear_time'] = browser.find_element_by_id(
            'INC_Alarm_ClearTime').get_attribute('value')
        title_match = re.search(r'([A-Z\-0-9]+) 上报 (.+)', order['title'])
        if title_match is not None:
            # Titles that do not follow the pattern simply get no
            # 'device' / 'event' keys.
            order['device'] = title_match.group(1)
            order['event'] = title_match.group(2)
        if order['clear_time'] != '':
            data2.append(order)
        else:
            data3.append(order)

    # TODO: detailed push notification
    msg_title = '{}故障工单 {} 个,已上清除 {} 个。'
    msg_title = msg_title.format(keyword, len(data), len(data2))
    find_time2 = [datef(order['find_time']) for order in data2]
    find_time3 = [datef(order['find_time']) for order in data3]
    msg_text = '### {}\n\n**已上清除建单时间分别为**:\n\n{}\n\n**未上清除建单时间分别为**:\n\n{}\n\n> 推送时间:{}'
    t2, t3 = '\n\n'.join(find_time2), '\n\n'.join(find_time3)
    msg_text = msg_text.format(msg_title, t2, t3, datef())
    send_msg(msg_markdown(msg_title, msg_text, True))
    browser.get(eoms_url)
    return data, data2, data3
def __init__(self, *args, **kwargs):
    """Initialise both base classes explicitly.

    All positional and keyword arguments are forwarded to
    ``IeWebDriver.__init__``; ``WaitUntil.__init__`` is called afterwards
    without extra arguments.
    """
    # Driver part first, receiving everything the caller passed in.
    IeWebDriver.__init__(self, *args, **kwargs)
    # Then the waiting helper, which takes no arguments here.
    WaitUntil.__init__(self)
def cleanUpCookiesAndLaunchIE(self):
    """Start a new driver, delete its cookies and open yahoo.com.

    :return: the newly created driver, positioned on http://www.yahoo.com
    """
    driver = WebDriver()
    # NOTE(review): cookies are deleted before any page is loaded; whether
    # that clears anything at this point depends on the driver - confirm.
    driver.delete_all_cookies()
    driver.get("http://www.yahoo.com")
    return driver
def __init__(self):
    """Create the IE driver wrapper.

    Both base initialisers are invoked explicitly and without arguments:
    ``WebDriver.__init__`` first, ``DriverWrapper.__init__`` second.
    """
    # Driver part first ...
    WebDriver.__init__(self)
    # ... then the wrapper part.
    DriverWrapper.__init__(self)
def _get_table_rows(driver: WebDriver) -> List[WebElement]:
    """Return the ``<tr>`` elements of the first table's ``<tbody>``.

    Args:
        driver: Browser session showing a page that contains a table.

    Returns:
        All ``tr`` elements found below the ``tbody`` of the first
        ``table`` element on the page.
    """
    # If we don't wait here, the table might not be loaded yet.
    time.sleep(1)
    table_body = driver.find_element_by_tag_name(
        'table').find_element_by_tag_name('tbody')
    return table_body.find_elements_by_tag_name('tr')
def _extract_quick_facts(
    driver: WebDriver, idx: int
) -> Tuple[Optional[int], Optional[int], Optional[int], List[str]]:
    """Read founding year, staff count, member count and working languages.

    Candidate elements are those with class ``ng-scope`` that contain at
    least one ``<strong>``. A candidate is assigned to a fact by its text
    ("Founded ...", "... staff", "... members", "Working languages ..."),
    and the value is the stripped text of its first ``<strong>``.

    Staff and member values that are not plain integers are resolved through
    hand-curated lookup tables; anything else stays ``None``.

    Args:
        driver: Browser session showing a detail page.
        idx: Index of the entry; only used in the diagnostic message printed
            for an unparseable staff value.

    Returns:
        ``(founding_year, staff_number, members_number, languages)``. Facts
        that are absent from the page are ``None`` (``[]`` for languages).

    Raises:
        ValueError: If a founding year is present but is not an integer.
    """
    # Known free-text staff values, written as they look after all spaces
    # have been removed.
    known_staff = {
        '6fulltime+2interns': 6,
        '2.8': 3,
        '3employeesand7volunteers': 3,
        '5employees': 5,
    }
    # Known free-text member values, written as they look after all spaces
    # and '+' signs have been removed.
    known_members = {
        '384organisations': 384,
        '50delegations': 50,
        '120,000': 120000,
        'Around35': 35,
        'about5000': 5000,
        '46nationaluniversitysportsgoverningbodies': 46,
        'around1000membersin34branches': 1000,
        '44associations': 44,
        'approx.700': 700,
        '5,000,000': 5000000,
        '250associations': 250,
        '28memberorganisations': 28,
    }

    founding_year = None
    staff_number = None
    members_number = None
    languages: List[str] = []

    def strong_text(element) -> str:
        # Stripped text of the element's first <strong> child.
        return element.find_element_by_tag_name('strong').text.strip()

    # Read each candidate's text once; every ``.text`` access queries the
    # browser, and the text is needed for up to four checks.
    candidates = [
        (element, element.text)
        for element in driver.find_elements_by_class_name('ng-scope')
        if element.find_elements_by_tag_name('strong')
    ]
    founded = [e for e, text in candidates if text.startswith('Founded')]
    staff = [e for e, text in candidates if text.strip().endswith('staff')]
    members = [
        e for e, text in candidates if text.strip().endswith('members')
    ]
    working_languages = [
        e for e, text in candidates if text.startswith('Working languages')
    ]

    if founded:
        founding_year = int(strong_text(founded[0]))

    if staff:
        staff_string = strong_text(staff[0]).replace(' ', '')
        try:
            staff_number = int(staff_string)
        except ValueError:
            staff_number = known_staff.get(staff_string)
            if staff_number is None:
                # Only report values that really could not be resolved.
                print(f'SKIPPING STAFF ({idx}) - {staff_string}')

    if members:
        members_string = strong_text(members[0])
        members_string = members_string.replace(' ', '').replace('+', '')
        try:
            members_number = int(members_string)
        except ValueError:
            members_number = known_members.get(members_string)

    if working_languages:
        # NOTE(review): this assigns the raw text (a ``str``) although the
        # annotation and the default promise ``List[str]`` - confirm what
        # the consumer of ``HardFacts`` expects before changing it.
        languages = strong_text(working_languages[0])

    return founding_year, staff_number, members_number, languages