def get_lineup(regular_elem: webdriver, bench_elem: webdriver) -> (str, str):

    captain = None
    vice = None
    complete_lineup = []

    players = regular_elem.find_elements_by_xpath(
            './/tr[contains(@class, "player-list-item")]')
    players += bench_elem.find_elements_by_xpath(
            './/tr[contains(@class, "player-list-item")]')[:-1]

    for player in players:
        name = get_player_name(player_elem=player)
        complete_lineup.append(name)

        try:
            player.find_element_by_xpath(
                    './/li[@data-original-title="Capitano"]')
            captain = name
        except NoSuchElementException:
            pass

        try:
            player.find_element_by_xpath(
                    './/li[@data-original-title="Vice capitano"]')
            vice = name
        except NoSuchElementException:
            pass

    captains = f'{captain}, {vice}'.upper()
    complete_lineup = ', '.join(complete_lineup).upper()

    return captains, complete_lineup
def check_hmi(proj: webdriver):
    # Try to locate page elements and return the info used to decide
    # whether the HMI page exists
    hmi_alive = False
    hmi_date = '-'
    hmi_time = '-'
    check_info = '-'
    if not proj:
        # The target page could not be fetched
        check_info = 'fail to open the url'
        log.info(check_info)
        print(proj)
        return hmi_alive, hmi_date, hmi_time, check_info
    else:
        try:
            elem = WebDriverWait(proj, 10).until(
                    expected_conditions.presence_of_element_located(
                            (By.XPATH, "/html/body/div")))
            if elem:
                hmi_alive = True
                try:
                    divs = proj.find_elements_by_xpath("/html/body/div")
                    if divs:
                        div_ids = {}
                        div_scrnos = {}
                        span_ids = {}
                        span_texts = {}
                        s = 1
                        for i in range(len(divs)):
                            div_ids[i] = divs[i].get_attribute('id')
                            div_scrnos[i] = divs[i].get_attribute('scrno')
                            if div_scrnos[i]:
                                break
                            s += 1
                        spans = proj.find_elements_by_xpath(
                                "/html/body/div[{}]/span".format(s))
                        for i in range(len(spans)):
                            span_ids[i] = spans[i].get_attribute('id')
                            span_texts[i] = spans[i].text
                            if 'DD' in span_ids[i]:
                                hmi_date = span_texts[i]
                            if 'TIME' in span_ids[i]:
                                hmi_time = span_texts[i]
                except Exception:
                    # Failed to find the date/time spans = HMI not online
                    check_info = 'date part not found'
        except Exception:
            # Could not read the HMI's web page; drop the current connection
            check_info = 'div not found'
            return hmi_alive, hmi_date, hmi_time, check_info
    return hmi_alive, hmi_date, hmi_time, check_info
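# A minimal usage sketch for check_hmi, assuming a Chrome driver; the HMI
# address below is hypothetical and only for illustration.
def _check_hmi_demo():
    proj = webdriver.Chrome()
    try:
        proj.get('http://192.0.2.10/hmi')  # hypothetical HMI address
        alive, hmi_date, hmi_time, info = check_hmi(proj)
        print(alive, hmi_date, hmi_time, info)
    finally:
        proj.quit()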
def get_match_data(brow: webdriver, match_element: webdriver) -> zip:

    scroll_to_element(brow, match_element)

    teams = match_element.find_elements_by_xpath(
            './/h4[@class="media-heading ellipsis"]')
    schemes = match_element.find_elements_by_xpath('.//h5')
    first11 = match_element.find_elements_by_xpath(
            './/table[@id="formationTable"]')
    reserves = match_element.find_elements_by_xpath(
            './/table[@id="releaseTable"]')
    points = match_element.find_elements_by_xpath(
            './/div[@class="team-main-info"]')
    time.sleep(1)

    return zip(teams, schemes, first11, reserves, points)
def regular_or_from_bench(player: webdriver) -> (int, int, int):

    """
    Set info about playing and substitutions for each player.

    :param player: selenium element

    :return: tuple, (int, int, int)
    """

    in_out = player.find_elements_by_xpath('.//td//em')
    attrs = [i.get_attribute('title') for i in in_out]

    regular = 0
    going_in = 0
    going_out = 0
    if 'Entrato' not in attrs and 'Uscito' not in attrs:
        regular += 1
    elif 'Entrato' in attrs and 'Uscito' not in attrs:
        going_in += 1
    elif 'Entrato' not in attrs and 'Uscito' in attrs:
        regular += 1
        going_out += 1
    elif 'Entrato' in attrs and 'Uscito' in attrs:
        going_in += 1
        going_out += 1

    return regular, going_in, going_out
def scrape_classifica(brow: webdriver) -> None:

    """
    Scrape real data from website in order to check later how the
    algorithm is working.
    """

    brow.get(f'{cfg.BASE_URL}classifica')
    time.sleep(3)

    dbf.empty_table(table='classifica')

    positions = brow.find_elements_by_xpath(
            './/table/tbody/tr[contains(@data-logo, ".png")]')
    columns = ['team', 'G', 'V', 'N', 'P', 'Gf', 'Gs', 'Dr', 'Pt', 'Tot']
    for pos in positions:
        team_data = []
        scroll_to_element(brow, pos)
        fields = pos.find_elements_by_xpath('.//td')[2:-2]
        for field in fields:
            team_data.append(field.text)
        dbf.db_insert(table='classifica', columns=columns, values=team_data)

    brow.close()
def find_matches(brow: webdriver) -> (list, bool):

    # Tells whether absolute points need to be scraped. It will be False
    # when scraping lineups of the current day, which is still incomplete
    day_is_closed = True

    # Find all matches
    matches = brow.find_elements_by_xpath(
            './/div[contains(@class, "match-details card calculated")]')

    if not matches:
        # If the day is not concluded, the match divs have a different class
        matches = brow.find_elements_by_xpath(
                './/div[contains(@class, "match-details")]')
        day_is_closed = False

    return matches, day_is_closed
def getalpha(page: webdriver):
    """
    Select the alphabetical list of communes.

    :param page:
    :return:
    """
    return page.find_elements_by_xpath(dbox + '/tbody/tr[1]/td[2]/p/a')
def get_image_links(driver: webdriver, site: str) -> Links:
    # On a side note, I am genuinely surprised how they managed to obfuscate
    # the images on each of their sites. On KissComics, all the image links
    # live in a script embedded in the page. On KissManga, the img tags only
    # load their image links once the page is rendered in a browser; viewing
    # the raw HTML shows no images, and I don't know enough WebDev to say
    # how they do it.
    image_links = []
    if site == 'comics':
        regex = re.compile('lstImages.push\\("(.*?)"')
        for a in driver.find_elements_by_tag_name('script'):
            img_set = re.findall(regex, a.get_attribute('innerHTML'))
            if img_set:
                image_links.append(img_set)
    elif site == 'manga':
        elements = driver.find_elements_by_xpath(
            '//img[@onerror="onErrorImg(this)"]')
        for elem in elements:
            src = elem.get_attribute('src')
            image_links.append(src)
    print(image_links)
    return np.array(image_links).flatten().tolist()
def open_panels(brow: webdriver, specific_panel: str = '') -> list:

    all_panels_path = '//div[@class="item-group ng-scope"]'
    wait_visible(brow, all_panels_path)
    all_panels = brow.find_elements_by_xpath(all_panels_path)

    panel_name_path = './/div[contains(@class, "group-name")]'
    buttons = [p.find_element_by_xpath(panel_name_path) for p in all_panels]

    # When playing the bet, only the right panel is opened
    if specific_panel:
        pairs = [(all_panels[x], buttons[x]) for x in range(len(buttons))
                 if buttons[x].get_attribute('innerText').strip().lower() ==
                 specific_panel]

    # while when scraping quotes all the valid panels are opened
    else:
        pairs = [(all_panels[x], buttons[x]) for x in range(len(buttons))
                 if buttons[x].get_attribute('innerText').strip().lower() in
                 cfg.PANELS_TO_USE]

    for _, b in pairs:
        scroll_to_element(brow, b)
        panel_name = b.text
        WebDriverWait(brow, cfg.WAIT).until(
                EC.element_to_be_clickable((By.LINK_TEXT, panel_name)))
        if 'active' not in b.get_attribute('class'):
            b.find_element_by_xpath('.//a').click()
            time.sleep(1)

    return [(b.get_attribute('innerText').strip().lower(), p)
            for p, b in pairs]
def cross_check_teams(table: webdriver, bets_db: list) -> (int, tuple):

    preds_list = table.find_elements_by_xpath('.//tr[@class="ng-scope"]')

    teams_web = []
    preds_details = []
    for pred in preds_list:
        match = pred.find_element_by_xpath('.//td[6]').text
        team1, team2 = match.strip().split(' - ')
        quote = float(pred.find_element_by_xpath('.//td[10]').text)
        result = pred.find_element_by_xpath('.//td[11]').text
        label_element = pred.find_element_by_xpath(
                './/div[contains(@class,"ng-scope")]')
        label = label_element.get_attribute('ng-switch-when')

        teams_web.append(team1)
        teams_web.append(team2)
        preds_details.append((team1, team2, quote, result, label))
    teams_web.sort()

    for bet_db_id, _ in bets_db:
        teams_db = dbf.db_select(table='predictions',
                                 columns=['team1', 'team2'],
                                 where=f'bet_id = {bet_db_id}')
        teams_db = [t for i in teams_db for t in i]
        teams_db.sort()

        if teams_web == teams_db:
            return bet_db_id, preds_details

    return 0, []
def click_sur_fiche_departement_annee(page: webdriver, Niveau):
    # "Annee" is expected to be defined at module scope
    elems = page.find_elements_by_xpath("//a[@href]")
    for elem in elems:
        print(Niveau, "elem:", elem.text, elem.get_attribute("href"))
        if Annee in elem.text:
            elem.click()
            break
def extract_all_bets_from_container(bets_container: webdriver) -> [webdriver]:

    bets_ngclass = "{'active':selection.selected}"
    all_bets = bets_container.find_elements_by_xpath(
            f'.//div[@ng-class="{bets_ngclass}"]')

    return all_bets
def close_all_headers(browser: webdriver) -> None:

    to_collapse_path = './/div[contains(@class, "collapse")]'
    to_collapse = browser.find_elements_by_xpath(to_collapse_path)
    for icon in to_collapse:
        sf.scroll_to_element(browser, icon)
        icon.click()
def waitForSearchUpdate(driver: webdriver, wait: float) -> list:
    "Wait for the search page to finish updating"
    # Intended approach: take the last shop name of the previous page as a
    # marker (lastShop) and poll until it is no longer visible, at which
    # point the update should be done. For now we simply sleep a fixed time.
    time.sleep(wait)
    tbody_elements = driver.find_elements_by_xpath(
        '//*[@id="inpage"]/div/div/div[2]/div/table/tbody/tr')
    return tbody_elements
def get_prize(brow: webdriver) -> float:

    prize_table = ('//div[@class="col-md-5 col-lg-5 col-xs-5 ' +
                   'pull-right pull-down"]')
    prize_el = brow.find_elements_by_xpath(prize_table + '//tr/td')[7]
    prize_value = prize_el.text[:-1].replace('.', '').replace(',', '.')

    return float(prize_value)
def turnHtmlintoElements(driver: webdriver, soup: BeautifulSoup, tag: str,
                         atri: str, value: str) -> []:
    # Map each BeautifulSoup node to its XPath, then locate the matching
    # selenium elements for that XPath.
    xpaths = [xpath_soup(obj) for obj in soup.find_all(tag, {atri: value})]
    return [driver.find_elements_by_xpath(xp) for xp in xpaths]
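# turnHtmlintoElements depends on an xpath_soup helper defined elsewhere in
# the project. A commonly used implementation (a sketch, assuming bs4 Tag
# inputs; not necessarily the project's own version) looks like this:
def xpath_soup(element):
    """Generate an XPath expression for a BeautifulSoup element."""
    components = []
    child = element if element.name else element.parent
    for parent in child.parents:
        siblings = parent.find_all(child.name, recursive=False)
        components.append(
            child.name if len(siblings) == 1
            else '%s[%d]' % (child.name,
                             next(i for i, s in enumerate(siblings, 1)
                                  if s is child)))
        child = parent
    components.reverse()
    return '/%s' % '/'.join(components)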
def get_bet_status(bet: webdriver) -> str:

    text = bet.find_elements_by_xpath('.//td')[2].text
    if text == 'Vincente':
        return 'WINNING'
    elif text == 'Non Vincente':
        return 'LOSING'
    else:
        return ''
def get_page_review_ids(driver: webdriver):
    ids: List[str] = []
    review_count_on_page: int = len(
        driver.find_elements_by_xpath(Config.xp_reviews_list))
    for i in range(review_count_on_page):
        review_id = driver.find_element_by_xpath(
            Config.xp_review_at_index(i + 1)).get_attribute('id')
        ids.append(review_id)
    return ids
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?q={q}&source=lnms&tbm=isch"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        for _ in range(10):
            scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        for img in thumbnail_results[results_start:number_results]:
            src = img.get_attribute('src')
            if src and ('http' in src or 'data' in src):
                image_urls.add(src)

        image_count = len(image_urls)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")

            # find_elements avoids raising when the button is absent
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(3)

            end_of_page = wd.find_elements_by_xpath(
                "//*[contains(text(), 'Looks like')]")
            if end_of_page:
                print("end of the page")
                break

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
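# A minimal usage sketch for fetch_image_urls, assuming a local chromedriver
# on PATH; the query string and the limit of 25 links are illustrative.
if __name__ == '__main__':
    wd = webdriver.Chrome()
    try:
        urls = fetch_image_urls('golden retriever', 25, wd,
                                sleep_between_interactions=1)
        print(f'Collected {len(urls)} image links')
    finally:
        wd.quit()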
def find_all_matches(brow: webdriver, league_name: str) -> [webdriver]:

    matches_path = './/div[@class="block-event event-description"]'
    try:
        wait_clickable(brow, matches_path)
    except TimeoutException:
        cfg.LOGGER.info(f'No match found for {league_name}.')
        return []

    return brow.find_elements_by_xpath(matches_path)
def all_fields_and_bets(panel: webdriver) -> [(str, webdriver)]:

    # Select all fields we want to scrape
    fields_in_db = dbf.db_select(table='fields', columns=['name'], where='')
    fields_names = {i.split('_')[0] for i in fields_in_db}

    all_fields_path = './/div[@class="market-info"]/div'
    all_bets_path = './/div[@class="market-selections"]'
    fields = panel.find_elements_by_xpath(all_fields_path)
    bets = panel.find_elements_by_xpath(all_bets_path)

    field_bets = []
    for field, bet_group in zip(fields, bets):
        field_name = field.get_attribute('innerText').upper().strip()
        if field_name in fields_names:
            field_bets.append((field_name, bet_group))

    return field_bets
def extract_urls(self, driver: webdriver, regex: str) -> list:
    urls = []
    elems = driver.find_elements_by_xpath("//a[@href]")
    for elem in elems:
        try:
            url = elem.get_attribute("href")
            if re.search(regex, url):
                urls.append(url)
        except Exception:
            pass
    return urls
def scrape_components(driver: webdriver):
    df = pd.DataFrame()
    for item in COMPONENT_HEADER_DATA:
        element_lst = WebDriverWait(driver, 10).until(
            lambda dr: driver.find_elements_by_xpath(item['xpath']))
        if item['header'] == 'name':
            df[item['header']] = [element.get_attribute('title')
                                  for element in element_lst]
        else:
            df[item['header']] = [element.text for element in element_lst]
    # 25200 s = 7 h offset added to the epoch timestamp
    time_stamp = time.time() + 25200
    df['timestamp'] = time_stamp
    return df
def select_playlist(driver: webdriver, title_list, artist_list, image_list):
    time.sleep(3)
    # Iterate over the first 12 playlists (assumes at least 12 are listed)
    for i in range(12):
        playlists = driver.find_elements_by_xpath(
            '//*[@id="container"]/section/div/ul/li')
        playlists[i].click()
        select_song(driver, title_list, artist_list, image_list)
        time.sleep(3)
        driver.back()
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(sleep_between_interactions)

    # build the google query
    search_url = ("https://www.google.com/search?safe=off&site=&tbm=isch"
                  "&source=hp&q={q}&oq={q}&gs_l=img")

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_xpath(
            '//*[@id="islrg"]/div[1]/div')
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. "
              f"Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real
            # image behind it
            try:
                img.click()
                sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_class_name('n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # for-else: runs only when the loop above did not break
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            sleep(1)
            # find_elements avoids raising when the button is absent
            load_more_button = wd.find_elements_by_class_name("mye4qd")
            if load_more_button:
                load_more_button[0].click()

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
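# A follow-up sketch for persisting the collected URLs, assuming the
# third-party `requests` package; folder and file names are illustrative.
# Non-HTTP sources (e.g. base64 data: URIs) are skipped.
import os
import requests

def save_image_urls(urls, folder='images'):
    os.makedirs(folder, exist_ok=True)
    for i, url in enumerate(urls):
        if not url.startswith('http'):
            continue  # skip data: URIs and other non-HTTP sources
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
        except requests.RequestException:
            continue
        with open(os.path.join(folder, f'{i}.jpg'), 'wb') as fh:
            fh.write(resp.content)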
def select_style(driver: webdriver):
    styles = driver.find_elements_by_xpath(
        '//*[@id="container"]/aside/div/table/tbody/tr[1]/td[1]/ul/li')
    max_iter = len(styles)
    for i in range(max_iter):
        # Re-query the list on every iteration; the previous navigation
        # invalidates the old element references
        styles = driver.find_elements_by_xpath(
            '//*[@id="container"]/aside/div/table/tbody/tr[1]/td[1]/ul/li')
        style_name = str(styles[i].text)
        styles[i].find_element_by_tag_name('a').send_keys(Keys.ENTER)
        time.sleep(3)
        driver.find_element_by_xpath(
            '//*[@id="container"]/section/div/header/p[2]/a[1]').click()

        title_list = []
        artist_list = []
        image_list = []
        select_playlist(driver, title_list, artist_list, image_list)

        data = zip(title_list, artist_list, image_list)
        data_to_csv(data, style_name, "style")
def filter_by_color(brow: webdriver) -> list:

    table_path = './/table[@id="tabellaRisultatiTransazioni"]'
    wait_visible(brow, table_path)
    bets_list = brow.find_elements_by_xpath(table_path +
                                            '//tr[@class="ng-scope"]')

    color_path = './/td[contains(@class,"state state")]'
    filtered = []
    for bet in bets_list:
        c = bet.find_element_by_xpath(color_path).get_attribute('class')
        if 'blue' not in c:
            filtered.append(bet)

    return filtered
def set_time_filter(brow: webdriver) -> None:

    path = ('.//div[@id="movement-filters"]/div[@id="games-filter"]' +
            '//label[@class="radio-inline"]')
    wait_visible(brow, path)
    all_filters = brow.find_elements_by_xpath(path)

    right_filter = [f for f in all_filters
                    if f.get_attribute('innerText').strip() ==
                    cfg.BETS_FILTER][0]

    scroll_to_element(brow, right_filter)
    right_filter.click()
    time.sleep(5)
def wrong_day_for_lineups(brow: webdriver, day_to_scrape: int) -> bool:

    # First check if the day in the webpage is the same as the day to scrape
    real_day_path = './/div[@class="filter-option-inner-inner"]'
    wait_visible(brow, cfg.WAIT, real_day_path)
    real_day = brow.find_element_by_xpath(real_day_path)
    real_day = int(real_day.text.split('°')[0])
    if day_to_scrape != real_day:
        return True

    # Then check if some lineup is missing
    hidden_path = './/div[contains(@class, "hidden-formation")]'
    missing_lineups = brow.find_elements_by_xpath(hidden_path)

    return bool(missing_lineups)
def search_hotel(driver: webdriver, placename):
    """
    Type the city name into the search box and pick the matching suggestion.

    :param driver:
    :param placename:
    :return:
    """
    driver.find_element_by_id(SEARCH_HOTEL_INPUT_BOX_ID).send_keys(placename)
    time.sleep(2)
    sugget_elements = driver.find_elements_by_xpath(SEARCH_PLACE_XPATH)
    for element in sugget_elements:
        if element.text == placename:
            element.click()
            break