def get_tabelog_ranking(driver: webdriver, area: str, keyword: str):
    result = list()
    driver.get(TABELOG_URL)
    driver.set_page_load_timeout(TIMEOUT)
    sleep(1)
    driver.find_element_by_xpath(AREA_FORM_XPATH).send_keys(area)
    driver.find_element_by_xpath(KEYWORD_FORM_XPATH).send_keys(keyword)
    driver.find_element_by_xpath(SEARCH_BTN_XPATH).click()
    driver.set_page_load_timeout(TIMEOUT)
    sleep(1)
    driver.find_elements_by_class_name(RANKING_BTN_CSS)[0].click()
    driver.set_page_load_timeout(TIMEOUT)
    sleep(1)
    ranking = driver.find_elements_by_class_name(RANKING_LIST_CSS)
    ranking_star = driver.find_elements_by_class_name(RANKING_STAR_LIST_CSS)
    for index, shop in enumerate(ranking):
        if hasattr(shop, "text"):
            newShop = ShopInfo()
            newShop.name = shop.text
            newShop.rank = index + 1
            newShop.star = ranking_star[index].text
            result.append(newShop)
    return result

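# A minimal usage sketch for get_tabelog_ranking. The module-level constants
# (TABELOG_URL, TIMEOUT, the form XPaths, and the ranking class names) and the
# ShopInfo container are defined elsewhere in the original project, so the
# ShopInfo stub below is an assumption about its shape, not the real definition.
from selenium import webdriver


class ShopInfo:
    """Hypothetical plain data holder matching the attributes set above."""
    name: str = ""
    rank: int = 0
    star: str = ""


if __name__ == "__main__":
    driver = webdriver.Chrome()  # assumes a chromedriver on PATH
    try:
        # Tabelog takes Japanese search terms, e.g. Tokyo / ramen
        for shop in get_tabelog_ranking(driver, area="東京", keyword="ラーメン"):
            print(shop.rank, shop.name, shop.star)
    finally:
        driver.quit()
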
def getConCafeData(browser: webdriver, url):
    try:
        # Access the URL
        browser.get(url)
        sleep(3)
        # Collect the list of area links
        AreaList = browser.find_elements_by_class_name("f-found_link")
        for Area in AreaList:
            Area.click()
            sleep(3)
            for Shop in browser.find_elements_by_class_name("free_shop"):
                # find_*_by_class_name does not accept compound classes,
                # so select "shop_name ellipsis" via a CSS selector instead
                print(Shop.find_elements_by_css_selector(".shop_name.ellipsis"))
                sleep(3)
        # Fetch the data fields
        articleElements = browser.find_elements_by_class_name("data")
        contactAddress = articleElements[3].text
        updateDate = articleElements[2].text
        return [contactAddress, updateDate]
    except Exception as e:
        return e

def login(driver: webdriver):
    login_url = "https://pvoutput.org/login.jsp"
    driver.get(login_url)
    driver.implicitly_wait(100)
    username = driver.find_element_by_id("login")
    password = driver.find_element_by_id("password")
    username.send_keys(USERNAME)
    password.send_keys(PASSWORD)
    driver.find_elements_by_class_name("btn-primary")[0].click()

def parse_post_data(driver: webdriver, url):
    """Collects the like and comment data for a post."""
    driver.get(url)
    try:
        # find_elements + indexing raises IndexError rather than
        # NoSuchElementException when the layout differs, so catch both.
        likes = int(
            driver.find_elements_by_class_name('sqdOP')[2]
            .find_element_by_tag_name('span').text.replace(' ', ''))
        # sqdOP yWX7d _8A5w5 vcOH2
        views = 0
    except (NoSuchElementException, IndexError):
        print('this is a video')
        try:
            button = driver.find_element_by_class_name('vcOH2')
            views = int(
                button.find_element_by_tag_name('span').text.replace(' ', ''))
            button.click()
            likes = int(
                driver.find_element_by_class_name('vJRqr')
                .find_element_by_tag_name('span').text.replace(' ', ''))
            button = driver.find_element_by_class_name('QhbhU')
            button.click()
        except NoSuchElementException:
            views = 0
            try:
                likes = int(
                    driver.find_element_by_class_name('vJRqr')
                    .find_element_by_tag_name('span').text.replace(' ', ''))
            except NoSuchElementException:
                likes = 0
                print(f'Found nothing at all for post: {url}')
    comments = get_comments_count(driver=driver)
    return likes, views, comments

def handle_room_tag(driver: webdriver, comm: str):
    """
    Enter each live room, insert the ad message,
    and write a log entry.
    """
    # Collect all room tags on the current page
    driver.execute_script(Order.page_end.value)
    sum_room = driver.find_elements_by_class_name(Order.room_tag.value)
    i = 0
    while i < len(sum_room):
        try:
            _into_room_handle(driver, comm, i)
        except ElementClickInterceptedException:
            move_down(driver)
            _into_room_handle(driver, comm, i)
        i += 1
    try:
        move_down(driver)
        tag = driver.find_element_by_css_selector(Order.page_down.value)
        if tag:
            time.sleep(1)
            tag.click()
            driver.implicitly_wait(5)
            handle_room_tag(driver, comm)
    except NoSuchElementException:
        print("finish")
        driver.close()
    return

def setup(driver: webdriver) -> None:
    """Set up the web page that's to be scraped."""
    # Find and click buttons to activate chat window.
    driver.get(URL)
    button = driver.find_elements_by_class_name('pb-quickReply')[1]
    button.click()
    button = driver.find_elements_by_class_name('pb-quickReply')[2]
    button.click()
    button = driver.find_elements_by_class_name('pb-quickReply')[5]
    button.click()
    # Wait until input form is visible.
    WebDriverWait(driver, 10).until(
        ec.visibility_of_all_elements_located((By.ID, 'main-input')))

def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(sleep_between_interactions)

    # build the google query
    search_url = ("https://www.google.com/search?safe=off&site=&tbm=isch"
                  "&source=hp&q={q}&oq={q}&gs_l=img")

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_xpath('//*[@id="islrg"]/div[1]/div')
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. "
              f"Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail so we can get the real image behind it
            try:
                img.click()
                sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_class_name('n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

        image_count = len(image_urls)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            sleep(1)
            load_more_button = wd.find_element_by_class_name("mye4qd")
            if load_more_button:
                load_more_button.click()

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

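# A minimal driver harness for fetch_image_urls, assuming the Selenium 3-era
# bindings used above (find_elements_by_*) and a chromedriver on PATH. The
# class names the scraper relies on ('n3VNCb', 'mye4qd') are tied to one
# particular Google Images layout and may have changed since this was written.
from time import sleep

from selenium import webdriver

if __name__ == "__main__":
    wd = webdriver.Chrome()
    try:
        urls = fetch_image_urls("sunflower", max_links_to_fetch=10, wd=wd,
                                sleep_between_interactions=1)
        for url in urls:
            print(url)
    finally:
        wd.quit()
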
def make_reservation(self, sportjaDriver: webdriver):
    self.sportjaDriver = sportjaDriver
    _check_if_next_week(self)
    classnames = sportjaDriver.find_elements_by_class_name("classname")
    weightList = []
    for classname in classnames:
        if classname.text == "Weightlifting":
            weightList.append(classname)
    for elem in weightList[1:]:
        elem.click()
        sleep(1)
    reserveButton = None
    waitingListButton = None
    cancelButton = None
    try:
        cancelButton = sportjaDriver.find_element_by_css_selector(
            "a.grey_btn_small:nth-child(2) > span:nth-child(1)")
    except NoSuchElementException:
        print("No cancelButton found.")
    try:
        reserveButton = sportjaDriver.find_element_by_css_selector(
            "#book_btn > span:nth-child(1)")
    except NoSuchElementException:
        print("No reserveButton found.")
    try:
        waitingListButton = sportjaDriver.find_element_by_css_selector(
            "#join_waiting_list_btn")
    except NoSuchElementException:
        print("No waitingListButton found.")
    if reserveButton:
        reserveButton.click()
        sportjaDriver.close()
        print('Class reserved for next Saturday!')
        return 0
    elif waitingListButton:
        waitingListButton.click()
        sportjaDriver.close()
        print('Put on waiting list, check your mailbox')
        return 0
    elif cancelButton:
        sportjaDriver.close()
        print('Already reserved')
        return 0
    else:
        print("Can't reserve or put on waiting list")
        sportjaDriver.close()
        return 1

def get_posts_list_and_subs(driver: webdriver, url: str, months: list):
    """Loads the user's page and returns a list of most of their recent posts."""
    driver.get(url)
    sleep(2)
    try:
        subs = int(
            driver.find_elements_by_class_name('g47SY')[1]
            .get_attribute('title').replace(' ', ''))
        print(f'{subs} followers')
    except IndexError:
        subs = 0
        print(f'Could not collect the follower count for account {url}')
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    sleep(2)
    posts = driver.find_elements_by_class_name('v1Nh3')
    posts_links = []
    for post in posts:
        posts_links.append(
            post.find_element_by_tag_name('a').get_attribute('href'))
    try:
        more_button = driver.find_element_by_class_name('xLCgt')
        more_button.click()
    except NoSuchElementException:
        pass
    except ElementClickInterceptedException:
        print('the "more" button could not be clicked')
    for _ in range(5):
        driver.execute_script(
            'window.scrollTo(0, document.body.scrollHeight);')
        sleep(2)
        posts = driver.find_elements_by_class_name('v1Nh3')
        for post in posts:
            posts_links.append(
                post.find_element_by_tag_name('a').get_attribute('href'))
    posts_links = delete_dublicates_from_list(posts_links)
    res = filter_posts_list(driver=driver, months=months,
                            posts_list=posts_links)
    return res, subs

def get_atms(driver: webdriver, city_name: str, region_name: str):
    """Walk through every results page and collect the list of ATMs.

    :driver: selenium.webdriver
    :returns: [{ ... }]
    """
    pages = 0
    rows = list()
    has_next_page = True
    while has_next_page:
        pages = pages + 1
        for row in driver.find_elements_by_class_name('page-atm__table_row'):
            rows.append({
                'region': region_name,
                'city': city_name,
                'bank': row.find_element_by_class_name('address-logo').text,
                'address_title': row.find_element_by_class_name('address-title').text,
                'address_type': row.find_element_by_class_name('address-type').text,
                'working_time': row.find_element_by_class_name(
                    'page-atm__table_col--time').text,
                'currency': row.find_element_by_class_name(
                    'page-atm__table_col--currency').text,
                'address_metro': row.find_element_by_class_name('address-metro').text,
                # link text on the page is Russian for "Show on map"
                'address_map': (row.find_element_by_class_name(
                    'address-map').find_element_by_link_text(
                    'Показать на карте').get_attribute("href")),
            })
        try:
            sleep(randint(1, 3))
            driver.find_element_by_class_name('pagination-arrow--next').click()
        except NoSuchElementException:
            has_next_page = False
    logging.info('{}: {} (pages: {}; atms: {})'.format(
        region_name, city_name, pages, len(rows)))
    return rows

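# A minimal usage sketch for get_atms. It assumes the driver has already been
# navigated to the first results page for the chosen city; the URL below is a
# placeholder, not the real site.
import csv

from selenium import webdriver

if __name__ == "__main__":
    driver = webdriver.Chrome()
    try:
        driver.get("https://example-bank.ru/atms?city=moscow")  # placeholder URL
        atms = get_atms(driver, city_name="Москва", region_name="Московская область")
        if atms:
            with open("atms.csv", "w", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=atms[0].keys())
                writer.writeheader()
                writer.writerows(atms)
    finally:
        driver.quit()
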
def getAccountsFromList(browser: webdriver, list_name):
    """
    Extract the followers of a list on Twitter.
    :param browser: webdriver
    """
    # Access the list
    browser.get('https://twitter.com/' + USER_NAME + '/lists/' + list_name +
                '/members')
    accounts = browser.find_elements_by_class_name("js-actionable-user")
    account_list = []
    for account in accounts:
        name = account.get_attribute('data-screen-name')
        if name is not None:
            account_list.append(name)
    return account_list

def empty_cart(driver: webdriver):
    try:
        WebDriverWait(driver, 3).until(
            expected_conditions.presence_of_element_located(
                (By.CLASS_NAME, 'footer')))
        remove_buttons = driver.find_elements_by_class_name('cart_button')
        for remove_button in remove_buttons:
            inventory_item_name = remove_button.find_element_by_xpath(
                '../../a/div').text
            remove_button.click()
            print(str(datetime.datetime.now()) + ' Item "' +
                  inventory_item_name + '" removed from cart.')
        print(str(datetime.datetime.now()) + ' empty_cart PASSED')
    except:
        traceback.print_exc()
        raise

def scrape_popularity_changes(self, driver: webdriver) -> List[ChangeStock]:
    WebDriverWait(driver, 3).until(
        EC.presence_of_element_located(
            (By.CLASS_NAME, constants.CLASS_USER_CHANGE)))
    stocks = driver.find_elements_by_class_name(constants.CLASS_USER_CHANGE)
    change_stocks = []
    # The page flattens each table row into five consecutive cells;
    # step row by row and pick cells via the *_DIFF offsets.
    for i in range(0, len(stocks), 5):
        rank = stocks[i].text
        symbol = stocks[i + constants.SYMBOL_DIFF].text
        change = stocks[i + constants.CHANGE_DIFF].text
        prev_day = stocks[i + constants.USERS_PREV_DAY_DIFF].text
        cur_day = stocks[i + constants.USERS_CUR_DAY_DIFF].text
        change_stock = ChangeStock(rank, symbol, change, prev_day, cur_day)
        change_stocks.append(change_stock)
    return change_stocks

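# scrape_popularity_changes leans on a constants module and a ChangeStock type
# defined elsewhere in the original project. The stubs below are assumptions
# about their shape, shown only to make the indexing logic concrete: with a
# five-cell row, the offsets would plausibly be 1 through 4.
from dataclasses import dataclass


@dataclass
class ChangeStock:  # hypothetical stand-in for the real class
    rank: str
    symbol: str
    change: str
    prev_day: str
    cur_day: str


class constants:  # hypothetical stand-in for the real module
    CLASS_USER_CHANGE = "user-change"  # placeholder class name
    SYMBOL_DIFF = 1
    CHANGE_DIFF = 2
    USERS_PREV_DAY_DIFF = 3
    USERS_CUR_DAY_DIFF = 4
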
def fill_cart(driver: webdriver):
    try:
        WebDriverWait(driver, 3).until(
            expected_conditions.presence_of_element_located(
                (By.CLASS_NAME, 'footer')))
        products = {}
        inventory_item_names = driver.find_elements_by_class_name(
            'inventory_item_name')
        for inventory_item_name in inventory_item_names:
            add_to_cart_button = inventory_item_name.find_element_by_xpath(
                '../../../div[@class="pricebar"]/button')
            products[inventory_item_name.text] = add_to_cart_button
        for product, button in products.items():
            button.click()
            print(str(datetime.datetime.now()) + ' "' + product +
                  '" added to cart.')
        print(str(datetime.datetime.now()) + ' fill_cart PASSED')
    except:
        traceback.print_exc()
        raise

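# fill_cart and empty_cart read like two halves of one smoke test: fill the
# cart from the inventory page, then remove everything again. A minimal
# harness under that assumption; the markup (inventory_item_name, cart_button,
# shopping_cart_link) resembles the saucedemo.com demo shop, but the target
# URL below is a guess, not confirmed by the source.
from selenium import webdriver

if __name__ == "__main__":
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.saucedemo.com/inventory.html")  # assumed target
        fill_cart(driver)
        driver.find_element_by_class_name("shopping_cart_link").click()
        empty_cart(driver)
    finally:
        driver.quit()
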
def sign_in(driver: webdriver) -> None:
    username = env("username")
    password = env("password")
    driver.get(URL)
    sleep(1.5)
    # go to log in by mousehunt account
    driver.find_elements_by_class_name("signInText")[0].click()
    # Enter credentials
    user_field = driver.find_elements_by_class_name("username")[3]
    user_field.send_keys(username)
    password_field = driver.find_elements_by_class_name("password")[3]
    sleep(1.5)
    password_field.send_keys(password)
    # Click Login
    driver.find_elements_by_class_name("actionButton")[1].click()
    sleep(1.5)

def get_vineyards(link: str, driver: webdriver, destination: str, date: date):
    driver.get(link)

    # Select category 'Crops'
    product_selector = driver.find_element_by_id('filter_3_primary')
    product_selector.click()
    product_selector.send_keys(Keys.ARROW_DOWN)
    product_selector.send_keys(Keys.ENTER)
    sleep(0.3)
    print('Loading category "Crops" successful.')

    # Select subcategory 'Fruit'
    crop_selector = driver.find_element_by_id('filter_3_secondary')
    crop_selector.click()
    crop_selector.send_keys(Keys.ARROW_DOWN)
    crop_selector.send_keys(Keys.ARROW_DOWN)
    crop_selector.send_keys(Keys.ENTER)
    sleep(0.3)
    print('Loading subcategory "Fruit" successful.')

    # Select 'Grapes For Wine'
    fruit_selector = driver.find_element_by_id('filter_3_tertiary')
    fruit_selector.click()
    for i in range(19):
        sleep(0.1)
        fruit_selector.send_keys(Keys.ARROW_DOWN)
    fruit_selector.send_keys(Keys.ENTER)
    print('Loading "Grapes For Wine" successful.')

    # Load all producers
    load_button = driver.find_element_by_id('scrollDown')
    n_elements = len(driver.find_elements_by_class_name('results_list_item'))
    while True:
        print('Loading more vineyards.')
        for i in range(20):
            load_button.click()
            sleep(0.1)
        n_elements_new = len(
            driver.find_elements_by_class_name('results_list_item'))
        if n_elements < n_elements_new:
            n_elements = n_elements_new
        else:
            print('Loaded all vineyards.')
            break

    # Create .csv
    with open(destination, 'w') as output:
        writer = csv.writer(output)
        writer.writerow([
            'Name', 'Category', 'Address', 'Phone', 'Email', 'Website',
            'Short description', 'Description', 'Crops', 'Processed products',
            'Cropped_acreage', 'Total_acreage'
        ])

    # Prepare spider
    selector = Selector(text=driver.page_source)
    links = [
        'http://www.biodynamicfood.org' + link for link in selector.xpath(
            '//*[@class="results_list_item"]//a/@href').extract()
    ]

    class MySpider(Spider):
        name = 'biodynamic'
        allowed_domains = ['http://www.biodynamicfood.org/']
        start_urls = links

        def parse(self, response):
            sel = Selector(response)
            Name = sel.xpath('//h1/text()').extract_first()
            Category = sel.xpath(
                '//h2[@class="business-type"]/text()').extract_first()
            address_field_1 = sel.xpath(
                '//div[@class="member-address"]/p/text()[1]').extract_first().strip()
            address_field_2 = sel.xpath(
                '//div[@class="member-address"]/p/text()[2]').extract_first().strip()
            Address = address_field_1 + '\n' + address_field_2
            contact_info = sel.xpath(
                '//div[@class="member-address"]/p/text()').extract()
            contact_info = [line.strip() for line in contact_info]
            Phone = [
                line for line in contact_info if line.startswith('Phone: ')
            ][0]
            Phone = Phone.replace('Phone: ', '')
            Email = sel.xpath(
                '//div[@class="member-address"]//a[1]/text()').extract_first()
            Website = sel.xpath(
                '//div[@class="member-address"]//a[2]/text()').extract_first()
            Short_description = sel.xpath(
                '//p[@class="quote"]/text()').extract_first()
            profile = sel.xpath(
                '//div[@class="member-profile"]/div/p/text()').extract()
            profile = [element.strip() for element in profile]
            len_Description = max([len(element) for element in profile])
            Description = [
                element for element in profile
                if len(element) == len_Description
            ][0]
            Crops = sel.xpath(
                '//div[p/*/text()="Crops"]//li//text()').extract()
            Crops = ', '.join(Crops)
            Processed_products = sel.xpath(
                '//div[p/*/text()="Processed Product"]//li//text()').extract()
            Processed_products = ', '.join(Processed_products)
            all_text = sel.xpath('//p/text()').extract()
            all_text = [text.strip() for text in all_text]
            Acreage = [text for text in all_text if 'Acres' in text]
            try:
                Cropped_acreage = Acreage[0]
                Total_acreage = Acreage[1]
            except IndexError:
                print('Acreage not specified for one organization.')
                Cropped_acreage = ''
                Total_acreage = ''
            with open(destination, 'a', newline='') as output:
                writer = csv.writer(output)
                writer.writerow([
                    Name, Category, Address, Phone, Email, Website,
                    Short_description, Description, Crops, Processed_products,
                    Cropped_acreage, Total_acreage
                ])

    # Run spider
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(MySpider)
    process.start()
    process.stop()

def download_chapter(driver: webdriver, chapter_url: str):
    driver.get(chapter_url)  # Open chapter
    settings = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//div[starts-with(@class, 'Navigation-module_settingsContainer_')]"
        )))  # Find settings button with wait
    settings.click()  # Open settings
    driver.find_elements_by_xpath(
        "//div[starts-with(@class, 'Modal-module_quarity_')]")[2].click(
        )  # Set image quality
    settings.click()  # Open settings again
    driver.execute_script(
        'document.querySelector("input#mode-horizontal").removeAttribute("disabled")'
    )  # Enable horizontal mode if it is disabled
    driver.execute_script(
        'document.querySelector("input#mode-horizontal").click()'
    )  # Turn on horizontal mode
    time.sleep(3)
    title = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, '/html/body/div/div[2]/div[2]/div[3]/div[1]/div[2]/a/h1'
             ))).text  # Wait for page load after refresh
    chapter = driver.find_element_by_xpath(
        '/html/body/div/div[2]/div[2]/div[3]/div[1]/div[2]/div/p').text[1:]
    title = title.replace(':', '')

    # load all images by paging left until the counter reaches the last page
    pages = int(
        driver.find_element_by_xpath(
            '/html/body/div/div[2]/div[2]/div[2]/div[2]/p').text.split(' / ')[1])
    actions = ActionChains(driver)
    while True:
        actions.send_keys(Keys.LEFT).perform()
        time.sleep(1)
        counter = driver.find_element_by_xpath(
            '/html/body/div/div[2]/div[2]/div[2]/div[2]/p').text
        if counter == f'{pages - 1} / {pages}' or counter == f'{pages} / {pages}':
            break

    os.makedirs(os.path.dirname(f'.//{title}//{chapter}//'), exist_ok=True)
    for page, img in enumerate(driver.find_elements_by_class_name('zao-image'),
                               start=1):
        # Re-encode each <img> through a canvas to get its pixels as base64
        # (raw string so the \/ in the JS regex is not a Python escape)
        b64 = driver.execute_script(
            r'''function getBase64Image(img) {
                var canvas = document.createElement("canvas");
                canvas.width = img.naturalWidth;
                canvas.height = img.naturalHeight;
                var ctx = canvas.getContext("2d");
                ctx.drawImage(img, 0, 0);
                var dataURL = canvas.toDataURL();
                return dataURL.replace(/^data:image\/(png|jpg);base64,/, "");
            }
            return getBase64Image(arguments[0])''', img)
        image_data = base64.b64decode(b64)
        with open(f'.//{title}//{chapter}//{page}.png', 'wb') as file:
            file.write(image_data)
    driver.close()

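# A minimal usage sketch for download_chapter. The reader UI it drives
# (settings modal, horizontal mode, the "N / M" page counter) is specific to
# one manga viewer, so the chapter URL below is a placeholder. Note that the
# function calls driver.close() itself when it finishes.
from selenium import webdriver

if __name__ == "__main__":
    driver = webdriver.Chrome()
    download_chapter(driver, "https://example.com/viewer/chapter/12345")  # placeholder URL
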
def extract(self, df: pd.DataFrame, driver: webdriver):
    # df2 = pd.DataFrame(columns=['first', 'last', 'profile', 'email', 'occupation', 'company', 'phone'])
    result = []
    total_rows = len(df)
    control_count = 0
    try:
        for index, r in df.iterrows():
            # Re-login every 100 profiles to keep the session fresh
            if (control_count % 100 == 0) and (control_count > 0):
                driver = self.perform_login()
                control_count = 0
            control_count += 1
            found_email = None
            found_job = None
            found_company = None
            found_phone = None
            found_location = None
            if pd.isnull(r['profileUrl']):
                continue
            first_name = r['firstName']
            last_name = r['lastName']
            profile_url = r['profileUrl']
            # driver.execute_script("window.open('', '_BLANK')")
            # driver.switch_to.window(driver.window_handles[1])
            driver.get(profile_url + '/detail/contact-info')
            not_found = driver.current_url == 'https://www.linkedin.com/in/unavailable/'
            # noinspection PyBroadException
            try:
                email = driver.find_element_by_xpath(
                    "//a[contains(@href, 'mailto')]")
                if email is not None:
                    found_email = email.text
            except:
                found_email = None
            try:
                occupation = driver.find_element_by_xpath(
                    "//h2[contains(@class, 'mt1 t-18 t-black t-normal break-words')]")
                if occupation is not None:
                    found_job = occupation.text
            except:
                found_job = None
            try:
                company_name = driver.find_element_by_xpath(
                    "//span[contains(@class, 'text-align-left ml2 t-14 t-black t-bold full-width lt-line-clamp lt-line-clamp--multi-line ember-view')]")
                if company_name is not None:
                    found_company = company_name.text
            except:
                found_company = None
            try:
                phone_number = driver.find_element_by_xpath(
                    "//li[contains(@class, 'pv-contact-info__ci-container t-14')]/span[contains(@class, 't-14 t-black t-normal')]")
                if phone_number is not None:
                    found_phone = phone_number.text
            except:
                found_phone = None
            try:
                e1 = driver.find_elements_by_class_name("ph5")
                e2 = e1[0].find_elements_by_class_name("pv-top-card--list-bullet")
                e3 = e2[0].find_element_by_class_name("t-16")
                found_location = e3.text
            except:
                found_location = None
            if (found_phone is None) and (found_company is None) \
                    and (found_email is None) and (found_job is None) \
                    and (found_location is None):
                if not not_found:
                    # Nothing scraped on a live profile: drop into the
                    # debugger, then re-login in case the session expired
                    breakpoint()
                    driver = self.perform_login()
            result.append({
                'first': first_name,
                'last': last_name,
                'profile': profile_url,
                'location': found_location,
                'email': found_email,
                'company': found_company,
                'occupation': found_job,
                'phone': found_phone,
            })
            if not_found:
                print(f'NOT FOUND: {first_name} {last_name} - {profile_url}')
            else:
                print(f'Found and added: {first_name} {last_name} '
                      f'{found_email} / {found_location} ({index}/{total_rows})')
            # time.sleep(10)
    finally:
        # NOTE: returning from finally also suppresses any in-flight exception
        return pd.DataFrame(result)

def courseDetails(driver: webdriver, url: str):
    driver.get(url)
    # time.sleep(7)
    l = WebDriverWait(driver, timeout=10).until(
        expected_conditions.presence_of_element_located(
            (By.CSS_SELECTOR, "div[jsname='rymPhb']")))
    # while not WebDriverWait(driver, timeout=10).until(expected_conditions.element_to_be_clickable(l.find_element_by_tag_name("div"))):
    #     time.sleep(2)
    # Scroll until the page height stops growing, so every item is loaded
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        height = driver.execute_script("return document.body.scrollHeight;")
        if height == check_height:
            break
        check_height = height
    pg = BeautifulSoup(driver.page_source, "lxml")
    temp = pg.find(name="div", attrs={"jsname": "rymPhb"})
    assign, material, text = [], [], []
    for i in range(len(temp.contents)):
        if len(temp.contents[i]["class"]) > 4:
            j = temp.contents[i].contents[0].contents[0].contents
            text = j[2].contents[0].contents[1].contents[0].string.split(": ")[1]
            date = j[2].contents[1].contents[1].string
            if j[0]["aria-label"][0] == "A":
                assign.append([i, text, date])
            else:
                material.append([i, text, date])
    time.sleep(3)
    total = driver.find_elements(By.CSS_SELECTOR, "div[jsname='rymPhb'] > div")
    for i in assign:
        # WebDriverWait(driver, timeout=5).until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, "div[jsname='rymPhb']")))
        driver.execute_script("arguments[0].click();", total[i[0]])
        # total[i[0]].click()
        time.sleep(3)
        assignment = BeautifulSoup(driver.page_source, "lxml")
        WebDriverWait(driver, timeout=6).until(
            expected_conditions.visibility_of(
                driver.find_elements_by_class_name("W4hhKd")[-1]))
        details = assignment.find_all(
            name="div", attrs={"class": "W4hhKd"})[-1].contents  # make DOM visible
        i.append(details[1].string)  # due date
        if i[-1] is None:
            i[-1] = "No due date"
        # i.append(details[0].contents[0].contents[0].string)  # max marks
        if len(details[0].contents) == 0:
            i.append("No marks mentioned")
            i.append("No marks received")
        else:
            temp = details[0].contents[0].contents[0].contents
            if len(temp) > 1:
                i.append(temp[1].string.split()[-1])
                i.append(temp[1].string.split()[0])
            else:
                i.append(temp[0])
                i.append("No marks received")
        details = assignment.find_all(
            name="aside", attrs={"class": "asCVDb"}
        )[-1].contents[0].contents[0].contents[1].contents[0]
        if details.contents[0].string[0] == 'A':
            i.append(details.contents[0].string + " (Not submitted)")
        else:
            i.append(details.contents[0].string)
        i[0] = driver.current_url
        driver.find_element_by_tag_name("nav").find_element_by_tag_name(
            "div").find_element_by_tag_name("div").find_element_by_tag_name(
            "div").find_element_by_tag_name("h1").find_element_by_tag_name(
            "a").click()
        time.sleep(2)
    for i in material:
        total[i[0]].click()
        time.sleep(2)
        i[0] = driver.current_url
        driver.find_element_by_tag_name("nav").find_element_by_tag_name(
            "div").find_element_by_tag_name("div").find_element_by_tag_name(
            "div").find_element_by_tag_name("h1").find_element_by_tag_name(
            "a").click()
        time.sleep(2)
    return assign, material

def get_related_search_words(driver: webdriver):
    related_search_words = driver.find_elements_by_class_name("nVcaUb")
    for word in related_search_words:
        print(word.text)
        print(type(word))