import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

urls_df = all_urls_df.drop([0, 1])
urls_df = urls_df[:5]

app_path = "/usr/local/bin/chromedriver"
#pc_app_path =
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = Chrome(executable_path=app_path, chrome_options=chrome_options)

for url in urls_df['recipe-url']:
    driver.get(url)
    time.sleep(2)

    # Scrape the title of the recipe
    recipe_title = driver.find_element_by_tag_name('h1').text

    # Scrape the creator-provided description (compound class names need a
    # CSS selector; find_element_by_class_name accepts only a single class)
    descrip = driver.find_element_by_css_selector(
        '.print-only.recipe-layout__description').text

    # Scrape the recipe author's name and profile link
    author_name = driver.find_element_by_css_selector(
        '.recipe-details__author-link.theme-color').text
    author_url = driver.find_element_by_css_selector(
        '.recipe-details__author-link.theme-color').get_attribute('href')

    # Scrape the star rating from the style attribute and the number of
    # reviews from the text
    star_rating = driver.find_element_by_class_name("stars-rate__filler")
    rating_val = star_rating.get_attribute("style")
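# The style attribute typically holds something like "width: 90%"; a minimal
# sketch (assuming that format, which is not confirmed by the snippet above)
# to turn it into a 0-5 star value:
import re

def parse_star_rating(style_value, max_stars=5):
    """Convert a 'width: NN%' style string into a float star rating."""
    match = re.search(r'([\d.]+)%', style_value)
    if match is None:
        return None
    return max_stars * float(match.group(1)) / 100.0

# e.g. parse_star_rating("width: 90%") -> 4.5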
import time

import selenium
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

senhaarqui = "suasenha"     # your password
loginarquias = "seucpf"     # your CPF (library login)

browser = Chrome()
browser.get("https://www.athena.biblioteca.unesp.br/F/RR61ETIMN453GKPD568TA7PDXE6NR6T665RJLJ6XB8FG34BTGV-43921?func=BOR-INFO")
time.sleep(5)

# Tab through the page to reach the login link, then press ENTER
for a in range(0, 17):
    browser.find_element_by_tag_name("body").send_keys(Keys.TAB)
time.sleep(5)
actions = ActionChains(browser)
actions.key_down(Keys.ENTER)
actions.perform()
time.sleep(5)

# Fill in the credentials and submit
browser.find_element_by_id("pat_id").send_keys(loginarquias)
browser.find_element_by_id("pat_password").send_keys(senhaarqui)
browser.find_element_by_id("pat_password").send_keys(Keys.ENTER)
time.sleep(5)

# Tab through the logged-in page the same way
for a in range(0, 17):
    browser.find_element_by_tag_name("body").send_keys(Keys.TAB)
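# The fixed "17 TABs then ENTER" navigation breaks as soon as the page layout
# changes. A sketch of a more robust alternative using an explicit wait; the
# link text below is a hypothetical placeholder, not taken from the actual
# Athena page:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Login"))  # hypothetical locator
).click()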
# Module-level setup referenced but not shown in this excerpt:
import time

from django.test import LiveServerTestCase
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

MAX_WAIT = 10  # assumed value; the constant is used but not defined in this excerpt


class NewVisitorTest(LiveServerTestCase):

    def setUp(self):
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        self.browser = Chrome(options=options)

    def tearDown(self):  # was "TearDown"; unittest only calls the lowercase name
        self.browser.quit()

    def wait_for_row_in_list_table(self, row_text):
        start_time = time.time()
        while True:
            try:
                table = self.browser.find_element_by_id('id_list_table')
                rows = table.find_elements_by_tag_name('tr')
                self.assertIn(row_text, [row.text for row in rows])
                return
            except (AssertionError, WebDriverException) as e:
                if time.time() - start_time > MAX_WAIT:
                    raise e
                time.sleep(0.5)

    def test_can_start_a_list_for_one_user(self):
        # Edith has heard about a cool new online to-do app. She goes
        # to check out its homepage
        self.browser.get(self.live_server_url)

        # She notices the page title and header mention to-do lists
        self.assertIn('To-Do', self.browser.title)
        header_text = self.browser.find_element_by_tag_name('h1').text
        self.assertIn('To-Do', header_text)

        # She is invited to enter a to-do item straight away
        inputbox = self.browser.find_element_by_id('id_new_item')
        self.assertEqual(
            inputbox.get_attribute('placeholder'),
            'Enter a to-do item'
        )

        # She types "Buy peacock feathers" into a text box (Edith's hobby
        # is tying fly-fishing lures)
        inputbox.send_keys('Buy peacock feathers')

        # When she hits enter, the page updates, and now the page lists
        # "1: Buy peacock feathers" as an item in a to-do list table
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # There is still a text box inviting her to add another item. She
        # enters "Use peacock feathers to make a fly" (Edith is very
        # methodical)
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Use peacock feathers to make a fly')
        inputbox.send_keys(Keys.ENTER)

        # The page updates again, and now shows both items on her list
        self.wait_for_row_in_list_table('2: Use peacock feathers to make a fly')
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # Satisfied, she goes back to sleep

    def test_multiple_users_can_start_lists_at_different_urls(self):
        # Edith starts a new to-do list
        self.browser.get(self.live_server_url)
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Buy peacock feathers')
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # She notices that her list has a unique URL
        edith_list_url = self.browser.current_url
        self.assertRegex(edith_list_url, '/lists/.+')

        # Now a new user, Francis, comes along to the site.

        ## We use a new browser session to make sure that no information
        ## of Edith's is coming through from cookies etc
        self.browser.quit()
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        self.browser = Chrome(options=options)

        # Francis visits the home page. There is no sign of Edith's list
        self.browser.get(self.live_server_url)
        page_text = self.browser.find_element_by_tag_name('body').text
        self.assertNotIn('Buy peacock feathers', page_text)
        self.assertNotIn('make a fly', page_text)

        # Francis starts a new list by entering a new item. He
        # is less interesting than Edith...
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Buy milk')
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy milk')

        # Francis gets his own unique URL
        francis_list_url = self.browser.current_url
        self.assertRegex(francis_list_url, '/lists/.+')
        self.assertNotEqual(francis_list_url, edith_list_url)

        # Again, there is no trace of Edith's list
        page_text = self.browser.find_element_by_tag_name('body').text
        self.assertNotIn('Buy peacock feathers', page_text)
        self.assertIn('Buy milk', page_text)

        # Satisfied, they both go back to sleep

    def test_layout_and_styling(self):
        # Edith goes to the home page
        self.browser.get(self.live_server_url)
        self.browser.set_window_size(1024, 768)

        # She notices the input box is nicely centered
        inputbox = self.browser.find_element_by_id('id_new_item')
        self.assertAlmostEqual(
            inputbox.location['x'] + inputbox.size['width'] / 2,
            512,
            delta=10
        )
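# The polling loop in wait_for_row_in_list_table generalizes into a reusable
# helper; a minimal sketch (not part of the original tests) of the same
# retry-until-timeout technique:
def wait_for(fn, timeout=MAX_WAIT):
    """Keep calling fn until it stops raising, or until timeout expires."""
    start_time = time.time()
    while True:
        try:
            return fn()
        except (AssertionError, WebDriverException) as e:
            if time.time() - start_time > timeout:
                raise e
            time.sleep(0.5)

# usage, from inside a test method:
#   wait_for(lambda: self.assertIn('To-Do', self.browser.title))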
# with open('./kbo_link.json', 'wt') as f:
#     json.dump(kbo_link, f)

### Crawl YouTube comments ###
kbo_bonki = []
for li in kbo_commentlink:
    delay = 2
    browser = Chrome()
    browser.implicitly_wait(delay)
    start_url = "https://www.youtube.com" + li
    browser.get(start_url)
    browser.maximize_window()
    print(start_url)
    time.sleep(3)

    body = browser.find_element_by_tag_name('body')
    pagedowns = 2  # page down twice to load comments
    while pagedowns:
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        pagedowns -= 1
    time.sleep(3)
    print('@@@@@@@ fine up to this point')

    html0 = browser.page_source
    soup = BeautifulSoup(html0, 'lxml')
    comment_list = soup.find_all('yt-formatted-string', id='content-text', limit=5)
import json
from time import sleep
from urllib.parse import urlparse

# `chrome` (a webdriver instance) and `preencher_formulario` (fills the form
# from keyword arguments) are defined elsewhere in this project.
dados = {
    'nome': 'Wagner',
    'email': '*****@*****.**',
    'senha': '123456',
    'telefone': '(00)00000-0000',
}
# percent-encodings we expect to see in the query string
dict_elementos = {'%40': '@', '%28': '(', '%29': ')'}
dict_url = {}

preencher_formulario(chrome, **dados)
sleep(5)

# Parse the submitted form data back out of the URL's query string
url_parseada = urlparse(chrome.current_url)
list_query = url_parseada.query.split('&')
for texto in list_query:
    atributo, valor = texto.split('=')
    if atributo != 'btn':
        dict_url[atributo] = valor

# Undo the percent-encoding
for cod, decod in dict_elementos.items():
    for chave, valor in dict_url.items():
        dict_url[chave] = valor.replace(cod, decod)

# The page echoes the data into a <textarea>; compare it with what we sent
textarea = chrome.find_element_by_tag_name('textarea')
dict_text = json.loads(textarea.text.replace('\'', '\"'))
assert dict_text == dict_url
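# The manual dict_elementos replacement reimplements percent-decoding; the
# standard library already covers both steps. A sketch of the same parsing
# with urllib.parse (same result under the same assumptions):
from urllib.parse import parse_qs, urlparse

def query_to_dict(url):
    """Return the query string as {key: decoded_value}, dropping 'btn'."""
    parsed = parse_qs(urlparse(url).query)  # parse_qs percent-decodes values
    return {k: v[0] for k, v in parsed.items() if k != 'btn'}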
from time import sleep

from selenium.webdriver import Chrome

url = 'https://curso-python-selenium.netlify.app/aula_03.html'
browser = Chrome()
browser.get(url)
sleep(3)

a = browser.find_element_by_tag_name('a')
for click in range(10):
    # note: the <p> elements are fetched *before* this click, so the last
    # paragraph reflects the previous click count
    p = browser.find_elements_by_tag_name('p')
    a.click()
    print(f'Value of p: {p[-1].text} click value: {click}')
    print(f'Value of p: {p[-1].text == str(click)}')
print(f'Text of a: {a.text}')
#browser.quit()
import os
import random
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from xvfbwrapper import Xvfb

# `settings`, `xpath`, `logger` and `InvalidUsernamePasswordError` are
# project-local modules/objects assumed by this class.


class InstaBot(object):

    base_url = 'https://www.instagram.com'

    def __init__(self, implicit_wait=20, page_load_timeout=30):
        try:
            Xvfb().start()
        except EnvironmentError:
            pass
        options = ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-setuid-sandbox')
        self.driver = Chrome(settings.CHROMEDRIVER_PATH, chrome_options=options)
        self.driver.implicitly_wait(implicit_wait)
        self.driver.set_page_load_timeout(page_load_timeout)
        self.wait = WebDriverWait(self.driver, settings.WEB_DRIVER_WAIT_SEC)
        self.liked = 0
        self.liked_total_samples = 0
        self.followed = 0

    def close(self):
        try:
            self.driver.delete_all_cookies()
            self.driver.close()
            from subprocess import call
            call(['killall', 'Xvfb'])
            call(['killall', 'chromedriver'])
        except Exception:
            pass

    def login(self, username=None, password=None):
        username = username or os.environ.get('INSTABOT_IG_USERNAME')
        password = password or os.environ.get('INSTABOT_IG_PASSWORD')
        if not username or not password:
            raise InvalidUsernamePasswordError
        logger.info("Logging in as: %s" % username)
        self.driver.get(self.base_url)
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, xpath.login))).click()
        self.driver.find_element_by_name('username').send_keys(username)
        self.driver.find_element_by_name('password').send_keys(password)
        self.driver.find_element_by_xpath(xpath.submit_login).click()

    def follow_users(self, usernames=None):
        """ Follow all the users (don't pass `@') """
        for username in usernames:
            time.sleep(settings.FOLLOW_USER_SLEEP_SEC)
            self.driver.get('%s/%s' % (self.base_url, username))
            try:
                elem = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, xpath.follow)))
                if elem.text.lower() != 'following':
                    elem.click()
                    self.followed += 1
                    logger.info("Started following %s" % username)
                else:
                    logger.info("Already following %s" % username)
            except NoSuchElementException as e:
                logger.info(e)
            except Exception as e:
                logger.error(e)

    def like_tags(self, tags, num=100):
        """
        Like `num' number of posts when exploring hashtag (don't pass `#')
        A random sample of posts will be liked for a given tag
        Return the usernames of the posts liked
        """
        usernames = []
        for tag in tags:
            time.sleep(settings.LIKE_TAG_SLEEP_SEC)
            logger.info("Liking posts with tag: %s" % tag)
            self.driver.get('%s/explore/tags/%s/' % (self.base_url, tag))
            time.sleep(settings.LIKE_TAG_SLEEP_SEC)
            self._load_more(max(1, num // 10))  # integer division: range() needs an int

            # get the actual url's of images to like
            try:
                main = self.driver.find_element_by_tag_name('main')
            except NoSuchElementException as e:
                logger.info(e)
                continue
            links = main.find_elements_by_tag_name('a')
            urls = [link.get_attribute('href') for link in links]
            sample = random.sample(urls, min(num, len(links)))
            self.liked_total_samples += len(sample)
            logger.info("Like sample size: %d" % len(sample))
            for url in sample:
                time.sleep(settings.LIKE_TAG_SLEEP_SEC)
                try:
                    self.driver.get(url)
                    elem = self.driver.find_element_by_link_text('Like')
                    username = self.driver.find_element_by_xpath(
                        xpath.profile_username).text
                    elem.click()
                    self.liked += 1
                    usernames.append(username)
                except NoSuchElementException as e:
                    logger.info(e)
        logger.info("Liked %d/%d" % (self.liked, self.liked_total_samples))
        return usernames

    def _load_more(self, n=10):
        """ Press "end" key `n' times to load more images """
        body = self.driver.find_element_by_tag_name('body')
        for _ in range(n):
            body.send_keys(Keys.END)
            time.sleep(settings.LOAD_MORE_SLEEP_SEC)
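# A minimal usage sketch for the class above (assumes the project-local
# `settings` module is configured and the Instagram credentials are in the
# INSTABOT_IG_* environment variables):
bot = InstaBot()
try:
    bot.login()
    liked_users = bot.like_tags(['photography'], num=20)  # hypothetical tag
    bot.follow_users(liked_users)
finally:
    bot.close()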
import csv
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

url = "https://www.youtube.com/user/whatsg/videos"
chromedriver = '/Users/mclaren/Downloads/CodingFor/chromedriver'
driver = Chrome(chromedriver)
driver.get(url)

html = driver.find_element_by_tag_name('html')
# title = driver.find_elements_by_id('video-title')
html.send_keys(Keys.END)
# print("-------------sleeping-----------------")
sleep(3)
html.send_keys(Keys.END)
# print("-------------sleeping-----------------")
sleep(3)

title = driver.find_elements_by_id('video-title')
titles = [i.text for i in title]
print(titles)
print("----------------------Number of titles {} ---------------------------".format(len(titles)))
driver.close()
import os
import time
from datetime import datetime
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
from pymongo import MongoClient
from selenium.common.exceptions import (NoSuchElementException,
                                        StaleElementReferenceException)
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys


class Scraper:
    """
    Class for an OkCupid Scraper

    Attributes:
        name (str): alias for the account that will be used to access OKC
            for scraping
        driver (WebDriver): tool used to get and navigate web pages
        mongoclient (pymongo.mongo_client.MongoClient): mongo database client
            to store data
        email (str): email of the scraper account
        pw (str): password of the scraper account
        version (str): date string of the datetime when current version was
            completed.
    """

    def __init__(self, name, headless=True,
                 driverpath=f'{os.getcwd()}/src/chromedriver'):
        """
        Constructor for the Scraper class

        Parameters:
            name (str): alias for the account that will be used to access
                okc for scraping
            driverpath (str): path to the web driver file
        """
        self.name = name
        opt = Options()
        opt.headless = headless
        self.driver = Chrome(executable_path=driverpath, options=opt)
        self.db = MongoClient('localhost', 27017).okc

        # get email and password from file
        user = pd.read_csv('src/okc_account_credentials', index_col=0).loc[name]
        self.email = user.email
        self.pw = user.pw

        # fetch current version
        record = self.db.scrapers.find_one({'_id': name})
        if record is not None:
            self.version = record['current_version']
        else:
            self.version = None

    def login(self):
        """ Logs in to the scraper's account """
        self.driver.get('https://www.okcupid.com/login')
        time.sleep(2)
        self.driver.find_element_by_class_name('accept-cookies-button')\
            .click()
        time.sleep(1)
        try:
            self.driver.find_element_by_class_name('login-username')\
                .send_keys(self.email)
            self.driver.find_element_by_class_name('login-password')\
                .send_keys(self.pw)
            self.driver.find_element_by_class_name('login-actions-button')\
                .click()
        # sometimes it's a different login form
        except NoSuchElementException:
            self.driver.find_element_by_name('username')\
                .send_keys(self.email)
            self.driver.find_element_by_name('password')\
                .send_keys(self.pw)
            self.driver.find_element_by_class_name('login2017-actions-button')\
                .click()
        time.sleep(2)

    def logout(self):
        """ Logs the current scraper out. """
        self.driver.get('https://www.okcupid.com/logout')
        self.driver.close()

    def set_first_version(self, question_data):
        #TODO docstring
        #qd = self.getScraperQuestionData()
        dt_now = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.db.scrapers.insert_one({
            '_id': self.name,
            'current_version': dt_now,
            'versions': {
                dt_now: question_data
            }})
        self.version = dt_now

    def add_questions_update_version(self, new_question_data):
        #TODO docstring
        dt_now = datetime.now().strftime('%Y%m%d_%H%M%S')
        versions = self.db.scrapers.find_one({'_id': self.name})['versions']
        prev_version = versions[self.version]
        versions[dt_now] = Scraper._merge_question_data_versions(
            prev_version, new_question_data)
        # update_one replaces the deprecated Collection.update()
        self.db.scrapers.update_one(
            {'_id': self.name},
            {'$set': {'versions': versions, 'current_version': dt_now}})
        self.version = dt_now

    @staticmethod
    def _merge_question_data_versions(prev_qd, new_qd):
        '''
        Returns a complete question-data-list that is the union of the two
        lists, where old versions of the same questions are replaced with
        new versions.
        '''
        ret_dict = {text: question for text, question in map(
            lambda q: (q['q_text'], q), new_qd)}
        for question in prev_qd:
            text = question['q_text']
            if ret_dict.get(text) is None:
                ret_dict[text] = question
        return list(ret_dict.values())

    def get_scraper_question_data(self, wait=1):
        #TODO docstring
        self.driver.get('https://www.okcupid.com/profile')
        time.sleep(wait * 3)
        sameq = lambda q1, q2: q1.find_element_by_xpath('button/h3').text == \
            q2.find_element_by_xpath('button/h3').text
        self.driver.find_element_by_class_name(
            'profile-selfview-questions-more').click()
        time.sleep(wait * 3)
        questions = self.driver.find_elements_by_class_name('profile-question')
        i = 0
        current = questions[i]
        datalist = []
        while not sameq(current, questions[-1]):
            for j in range(i, len(questions)):
                qdatum = Scraper.get_data_from_answer_stub(questions[j])
                datalist.append(qdatum)
            '''#open question detail overlay
            questions[j].click()
            #needs a moment to load
            time.sleep(0.7)
            #scrape question overlay.
            overlay = self.driver.find_element_by_class_name('questionspage')
            #get question text
            text = overlay.find_element_by_tag_name('h1').text
            #get the choices and what our answer was
            our_answer_buttons = overlay.find_element_by_class_name('pickonebutton-buttons')\
                .find_elements_by_class_name('pickonebutton-button')
            choices = [b.text for b in our_answer_buttons]
            our_answer = list(map(lambda x: x.get_attribute('class')\
                .endswith('--selected'), our_answer_buttons)).index(True)
            #get which of their answers we will accept
            their_answer_buttons = overlay.find_element_by_class_name('pickmanybuttons')\
                .find_elements_by_tag_name('input')
            acceptable = list(map(lambda x: x.get_property('checked'), their_answer_buttons))
            #get how important the question is to our scraper
            #should all be 1:somewhat important
            importance_buttons = overlay.find_element_by_class_name('importance-pickonebutton')\
                .find_elements_by_tag_name('button')
            importance = list(map(lambda b: b.get_attribute('class').endswith('--selected'),\
                importance_buttons)).index(True)
            #package up the question data
            datalist.append({
                'q_text': text,
                'choices': choices,
                'our_answer': our_answer,
                'acceptable': acceptable,
                'importance': importance
            })
            #exit overlay
            self.driver.find_element_by_class_name('reactmodal-header-close').click()'''
            # adjust loop conditions
            current = questions[j]
            # property access scrolls the element into view
            current.location_once_scrolled_into_view
            time.sleep(wait)
            questions = self.driver.find_elements_by_class_name(
                'profile-question')
            for i in range(len(questions)):
                if sameq(current, questions[i]):
                    break
        return datalist

    def collect_usernames(self, softlimit=np.inf):
        #TODO docstring
        self.driver.get('https://www.okcupid.com/match')
        time.sleep(2)
        usernames = set()
        exit_stat = None  # stays None if the soft limit ends the loop
        while len(usernames) < softlimit:
            try:
                self.driver.find_element_by_class_name('blank-state-wrapper')
                exit_stat = 1
                break
            except NoSuchElementException:
                try:
                    self.driver.find_element_by_class_name('match-results-error')
                    exit_stat = 0
                    break
                except NoSuchElementException:
                    pass
            try:
                matchcards = self.driver.find_elements_by_class_name('usercard-thumb')
                last = matchcards[-1]
                usernames = usernames.union(set(map(
                    lambda card: card.get_attribute('data-username'),
                    matchcards)))
                # property access scrolls the last card into view
                last.location_once_scrolled_into_view
            except StaleElementReferenceException:
                time.sleep(0.5 + np.random.exponential())
        return (usernames, exit_stat)

    def scrape_user(self, img_save_dir, username, wait=1.5):
        #TODO docstring
        #TODO need try-accept block for when user isn't found
        # scrape questions first
        self.driver.get(f'https://www.okcupid.com/profile/{username}/questions')
        time.sleep(wait)
        # if there are any unanswered questions, answer them so we can scrape
        # ALL the user's answered questions later.
        if self.get_num_questions_by_filter('FIND OUT') > 0:
            qdata = self.answer_unanswered_questions()
            self.add_questions_update_version(qdata)
        # scrape the questions the user has answered
        questions = self.scrape_user_questions(username)
        # scrape their main profile contents
        self.driver.get(f'https://www.okcupid.com/profile/{username}')
        time.sleep(wait * np.random.exponential())
        try:
            self.driver.find_element_by_class_name('profile-essays-expander').click()
        except NoSuchElementException:
            # short profiles
            pass
        html = self.driver.find_element_by_tag_name('HTML').get_attribute('innerHTML')
        # scrape images
        img_count = self.save_images(img_save_dir, username)
        # the original had a trailing comma here, which made dtime a tuple
        dtime = datetime.now().strftime('%Y%m%d_%H%M%S')
        # package it all up
        return {
            '_id': username,
            'html': html,
            'img_count': img_count,
            'questions': questions,
            'metadata': {
                'time': dtime,
                'scraper': self.name,
                'scraper_version': self.version
            }
        }

    def answer_question_overlay(self, importance_answer=1, wait=0.1):
        #TODO docstring
        overlay = self.driver.find_element_by_class_name('questionspage')
        # get button arrays
        our_answer_buttons = overlay.find_element_by_class_name('pickonebutton-buttons')\
            .find_elements_by_class_name('pickonebutton-button')
        their_answer_buttons = overlay.find_element_by_class_name('pickmanybuttons')\
            .find_elements_by_tag_name('button')
        importance_buttons = overlay.find_element_by_class_name('importance-pickonebutton')\
            .find_elements_by_tag_name('button')
        # get data to store
        text = overlay.find_element_by_tag_name('h1').text
        choices = [b.text for b in our_answer_buttons]
        answer = int(np.random.uniform() * len(choices))
        acceptable_arr = [False] * len(choices)
        acceptable_arr[answer] = True
        # click the appropriate buttons
        our_answer_buttons[answer].click()
        time.sleep(wait)
        their_answer_buttons[answer].click()
        time.sleep(wait)
        importance_buttons[importance_answer].click()
        time.sleep(wait)
        # submit form
        self.driver.find_element_by_class_name('questionspage-buttons-button--answer')\
            .click()
        return {
            'q_text': text,
            'choices': choices,
            'our_answer': answer,
            'acceptable': acceptable_arr,
            'importance': importance_answer
        }

    def answer_unanswered_questions(self, wait=1, importance_answer=1):
        #TODO docstring
        qdata = []
        while self.get_num_questions_by_filter('FIND OUT') > 0:
            try:
                self.driver.find_element_by_class_name(
                    'profile-questions-filter-icon--findOut').click()
                time.sleep(wait)
                self.driver.find_element_by_class_name('profile-question')\
                    .click()
                time.sleep(wait)
                qdata.append(self.answer_question_overlay(importance_answer))
                time.sleep(wait)
            except NoSuchElementException:
                wait += 0.1
                time.sleep(wait)
                continue
        return qdata

    def get_num_questions_by_filter(self, filterstr):
        #TODO docstring
        arr = self.driver.find_element_by_class_name('profile-questions-filters')\
            .text.split('\n')
        return int(arr[arr.index(filterstr) + 1])

    def scroll_to_bottom(self, wait):
        #TODO docstring
        body = self.driver.find_element_by_tag_name('body')
        y = [0, 1]
        while y[0] != y[1]:
            y[0] = y[1]
            body.send_keys(Keys.END)
            time.sleep(wait)
            y[1] = self.driver.execute_script('return window.pageYOffset;')

    def scrape_user_questions(self, username):
        #TODO docstring
        q = dict()
        for filterstr in ['AGREE', 'DISAGREE']:
            time.sleep(1)
            q[filterstr] = self.scrape_user_questions_by_filter(filterstr)
        return q

    def scrape_user_questions_by_filter(self, filterstr, wait=0.3):
        #TODO docstring
        self.driver.find_element_by_tag_name('body')\
            .send_keys(Keys.HOME)
        time.sleep(0.7 + wait)
        self.driver.find_element_by_class_name(
            f'profile-questions-filter-icon--{filterstr.lower()}').click()
        time.sleep(0.7 + wait)
        numQsToScrape = self.get_num_questions_by_filter(filterstr)
        self.scroll_to_bottom(wait)
        questions = self.driver.find_elements_by_class_name('profile-question')
        while len(questions) != numQsToScrape:
            wait += 0.1
            self.scroll_to_bottom(wait)
            questions = self.driver.find_elements_by_class_name('profile-question')
        return [q.get_attribute('innerHTML') for q in questions]

    @staticmethod
    def get_src(img):
        #TODO docstring
        src = img.get_attribute('src')
        if src is None:
            src = img.get_attribute('data-src')
        return src

    def save_images(self, save_dir, username):
        #TODO docstring
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        images = self.driver.find_element_by_class_name('profile-thumb')\
            .find_elements_by_tag_name('img')
        images.extend(self.driver.find_element_by_class_name('profile-essays')
                      .find_elements_by_tag_name('img'))
        for url in map(Scraper.get_src, images):
            i = requests.get(url).content
            name = urlparse(url).path.split('/')[-1]
            with open(f'{save_dir}/{username}_{name}', 'wb') as f:
                f.write(i)
        return len(images)

    def answer_all_questions(self, importance_answer=1, wait=1):
        #TODO docstring
        self.driver.get('https://www.okcupid.com/profile')
        time.sleep(2 * wait + np.random.exponential())
        self.driver.find_element_by_class_name('profile-selfview-questions-more')\
            .click()
        time.sleep(2 * wait + np.random.exponential())
        qdata = []
        self.driver.find_element_by_class_name(
            'profile-questions-next-actions-button--answer').click()
        count = 0
        while True:
            try:
                time.sleep(wait)
                qdatum = self.answer_question_overlay(importance_answer)
                qdata.append(qdatum)
                count += 1
            except NoSuchElementException:
                try:
                    self.driver.find_element_by_id('no-questions-blank-state')
                    exit_stat = 'reached end of questions.'
                    break
                except NoSuchElementException:
                    wait += 0.1
                    time.sleep(wait)
                    continue
            except Exception as e:
                exit_stat = f'Error: {str(e)}'
                break
        return (qdata, exit_stat)

    def answer_initial_question(self, wait=1):
        #TODO docstring
        qtext = self.driver.find_element_by_class_name('convoanswers-text')\
            .text
        choicebuttons = self.driver\
            .find_element_by_class_name('convoanswers-answers')\
            .find_elements_by_tag_name('button')
        choicestext = [b.text for b in choicebuttons]
        answer = int(np.random.uniform() * len(choicestext))
        choicebuttons[answer].click()
        time.sleep(wait)
        choicebuttons = self.driver\
            .find_element_by_class_name('convoanswers--theirs')\
            .find_elements_by_tag_name('button')
        choicebuttons[answer].click()
        time.sleep(wait)
        acceptable = [False] * len(choicestext)
        acceptable[answer] = True
        self.driver.find_element_by_class_name('convoquestion-continue')\
            .click()
        #TODO verify the assumed importance answer is right
        return {
            'q_text': qtext,
            'choices': choicestext,
            'our_answer': answer,
            'acceptable': acceptable,
            'importance': 1
        }

    def answer_all_initial_questions(self, wait=1):
        #TODO docstring
        qdata = []
        current_q, num_qs = self.get_progress()
        for i in range(num_qs - current_q + 1):
            qdata.append(self.answer_initial_question(wait))
            time.sleep(wait * 2)
        return qdata

    def get_progress(self):
        #TODO docstring
        return tuple(map(int, self.driver
                         .find_element_by_class_name('obqconvo-progress-text')
                         .text.split(' of ')))

    def save_usernames_to_mongo(self, usernames):
        self.db.usernames.insert_many(map(lambda u: {'_id': u}, usernames))

    @staticmethod
    def get_data_from_answer_stub(stub):
        text = stub.find_element_by_tag_name('h3').text
        answer_divs = stub.find_elements_by_class_name(
            'profile-question-self-answers-answer')
        choices = [a.text for a in answer_divs]
        our_answer = [a.get_attribute('class').endswith('--isYourAnswer')
                      for a in answer_divs].index(True)
        acceptable = [not a.get_attribute('class').endswith('--isUnacceptable')
                      for a in answer_divs]
        return {
            'q_text': text,
            'choices': choices,
            'our_answer': our_answer,
            'acceptable': acceptable,
            'importance': 1
        }
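# A minimal usage sketch for the Scraper class above (assumes a local MongoDB
# and the 'src/okc_account_credentials' CSV described in __init__; the alias
# and the `profiles` collection name are hypothetical):
scraper = Scraper('my_alias')
scraper.login()
usernames, exit_stat = scraper.collect_usernames(softlimit=50)
scraper.save_usernames_to_mongo(usernames)
for username in usernames:
    doc = scraper.scrape_user('imgs', username)
    scraper.db.profiles.insert_one(doc)
scraper.logout()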
from selenium.webdriver import Chrome

# headless mode
url = 'http://selenium.dunossauro.live/exercicio_01.html'
navegador = Chrome()
navegador.get(url)

h1 = navegador.find_element_by_tag_name('h1').text
ps = navegador.find_elements_by_tag_name('p')

attrs = []
texts = []
for p in ps:
    attrs.append(p.get_attribute('atributo'))
for p in ps:
    texts.append(p.text)

print({h1: dict(zip(attrs, texts))})
# Helper that finds a link (<a>) whose text matches `content`.
def find_a_by_content(browser, content):
    elementos = browser.find_elements_by_tag_name('a')
    for elemento in elementos:
        if elemento.text == content:
            return elemento

browser = Chrome()
browser.get(url)

# Page 1
sleep(3)
main = browser.find_element_by_tag_name('main')
main.find_element_by_tag_name('a').click()

# Page 2
sleep(3)
main = browser.find_element_by_tag_name('main')
find_a_by_attr(main, 'attr', 'errado').click()

# Page 3
sleep(10)
browser.refresh()
sleep(2)
main = browser.find_element_by_tag_name('main')
titulo = browser.title
find_a_by_content(main, browser.title).click()
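# `url`, `sleep`, `Chrome` and `find_a_by_attr` come from elsewhere in the
# exercise; a plausible sketch of the missing pieces (the URL is assumed, and
# the helper simply mirrors find_a_by_content, matching on an attribute
# instead of the link text):
from time import sleep
from selenium.webdriver import Chrome

url = 'https://curso-python-selenium.netlify.app/exercicio_01.html'  # assumed

def find_a_by_attr(parent, attr, value):
    """Return the first <a> under `parent` whose `attr` equals `value`."""
    for elemento in parent.find_elements_by_tag_name('a'):
        if elemento.get_attribute(attr) == value:
            return elemento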
# Go back to the start
back = driver.find_element_by_class_name("start")
back.click()

# Navigate to the picture-book screen
picbook = driver.find_element_by_class_name("picture_book")
picbook.click()

# Pick a specific creature in the picture book and navigate to its page
#setumei = driver.find_element_by_xpath("//input[@value='1']").click()
setumei = driver.find_element_by_xpath(
    "//input[@src='../static/imgs/1.png']").click()

# Return to the picture book
back = driver.find_element_by_class_name("start")
back.click()

# Go back to the first screen
last = driver.find_element_by_tag_name("a")
last.click()

# Wait until the title is displayed
time.sleep(3)

# Take a screenshot
driver.save_screenshot('chrome_abunator_last.png')

# Close the browser
driver.quit()
from selenium.webdriver import Chrome

browser = Chrome()
browser.get('https://selenium.dunossauro.live/aula_04_a.html')

lista_nao_ordenada = browser.find_element_by_tag_name('ul')  # 1
lis = browser.find_elements_by_tag_name('li')  # 2
lis[0].find_element_by_tag_name('a').text  # 3
"""
1. we look up the ul
2. we look up all the li elements
3. in the first li, we look up the `a` and take its text

ul
    li
        a
            text
    li
        a
            text
"""
from time import sleep

from selenium.webdriver import Chrome

url = 'https://curso-python-selenium.netlify.app/exercicio_02.html#'
navegador = Chrome()
navegador.get(url)
sleep(3)

a = navegador.find_element_by_tag_name('a')
p = navegador.find_elements_by_tag_name('p')
a.click()

if p[1] == p[-1]:
    print(a.text)
    # p is a list; the original called .text on the list itself
    print(p[-1].text)
from time import sleep

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys


def scroll_page(driver: Chrome, height: int):
    """Press END and recurse until document.body.scrollHeight stops growing.

    (The original annotated `height` as str, but scrollHeight is an int.)
    """
    driver.find_element_by_tag_name("body").send_keys(Keys.END)
    sleep(3)
    if height != driver.execute_script("return document.body.scrollHeight"):
        scroll_page(driver,
                    driver.execute_script("return document.body.scrollHeight"))
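# Usage sketch for scroll_page: seed it with the page's current height so the
# first comparison is meaningful (the URL below is hypothetical):
driver = Chrome()
driver.get("https://example.com/infinite-feed")  # hypothetical URL
scroll_page(driver, driver.execute_script("return document.body.scrollHeight"))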
import time
from os import path

from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select

# `here_dir`, `percySnapshot`, and `get_visible_form` are defined elsewhere
# in this test module.


def test_input(browser: Chrome, enable_percy=False):
    """Test input::

        run template.basic_input()
        actions(['Continue'])
        template.background_input()
        # or: await template.coro_background_input() / flask_coro_background_input
    """
    browser.find_element_by_css_selector('input').send_keys("22")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("secret")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_tag_name('form').submit()

    # checkbox
    time.sleep(0.5)
    browser.execute_script("arguments[0].click();",
                           browser.find_element_by_css_selector('input'))
    browser.find_element_by_tag_name('form').submit()

    # Text Area
    time.sleep(0.5)
    browser.find_element_by_css_selector('textarea').send_keys(
        " ".join(str(i) for i in range(20)))
    browser.find_element_by_tag_name('form').submit()

    # file
    time.sleep(0.5)
    img_path = path.join(here_dir, 'assets', 'img.png')
    browser.find_element_by_css_selector('input').send_keys(img_path)
    browser.find_element_by_tag_name('form').submit()

    # text
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("text")
    browser.find_element_by_tag_name('form').submit()

    # form cancel
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('.pywebio_cancel_btn'))

    # valid func, age in [10, 60]
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("1")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_css_selector('input').clear()
    browser.find_element_by_css_selector('input').send_keys("90")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_css_selector('input').clear()
    browser.find_element_by_css_selector('input').send_keys("23")
    browser.find_element_by_tag_name('form').submit()

    # code
    time.sleep(0.5)
    # browser.find_element_by_css_selector('textarea').send_keys(" ".join(str(i) for i in range(20)))
    browser.find_element_by_tag_name('form').submit()

    # Cancelable from group
    time.sleep(0.5)
    browser.find_element_by_name('name').send_keys("name")
    time.sleep(1)
    browser.find_element_by_name('age').send_keys("90")
    browser.find_element_by_tag_name('form').submit()
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser, name='input group invalid')

    time.sleep(0.5)
    browser.find_element_by_name('age').clear()
    browser.find_element_by_name('age').send_keys("23")
    browser.find_element_by_tag_name('form').submit()

    # callback actions
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('form button[type="button"]'))
    time.sleep(0.4)

    # input action
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('form button[type="button"]'))
    time.sleep(0.4)
    browser.find_element_by_tag_name('form').submit()

    # Input group
    time.sleep(0.5)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser, name='input group all')
    browser.find_element_by_name('text').send_keys("name")
    browser.find_element_by_name('number').send_keys("20")
    browser.find_element_by_name('float').send_keys("3.1415")
    browser.find_element_by_name('password').send_keys("password")
    browser.find_element_by_name('textarea').send_keys(
        " ".join(str(i) for i in range(20)))
    # browser.find_element_by_css_selector('[name="code"]').send_keys(" ".join(str(i) for i in range(10)))
    Select(browser.find_element_by_name('select-multiple')).select_by_index(0)
    # browser.find_element_by_css_selector('[name="select"]').send_keys("name")
    # browser.find_element_by_css_selector('[name="checkbox-inline"]').send_keys("name")
    # browser.find_element_by_css_selector('[name="checkbox"]').send_keys("name")
    # browser.find_element_by_css_selector('[name="radio-inline"]').send_keys("name")
    # browser.find_element_by_css_selector('[name="radio"]').send_keys("name")
    browser.find_element_by_name('file_upload').send_keys(
        path.join(here_dir, 'assets', 'helloworld.txt'))

    browser.execute_script("$('form button').eq(1).click()")
    time.sleep(1)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser, name='input group all invalid')

    browser.find_element_by_name('password').clear()
    browser.find_element_by_name('password').send_keys("123")
    browser.execute_script("$('form button').eq(1).click()")
    time.sleep(1)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(1)
    enable_percy and percySnapshot(browser=browser, name='input group all submit')
    browser.find_element_by_css_selector('form').submit()

    # background
    time.sleep(3)
    get_visible_form(browser).find_element_by_css_selector('input').send_keys(
        "background")
    get_visible_form(browser).find_element_by_tag_name('form').submit()

    # front
    time.sleep(0.5)
    get_visible_form(browser).find_element_by_css_selector('input').send_keys(
        "front")
    get_visible_form(browser).find_element_by_tag_name('form').submit()
import json
import time
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys


def son():
    print('월클손흥민')  # "world-class Son Heung-min"


## Crawl links and titles for YouTube playback ##
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome('./chromedriver.exe', options=chrome_options)
driver.get("https://www.youtube.com/")
time.sleep(1)

search_box = driver.find_element_by_xpath('//*[@id="search"]')
search_box.send_keys('KBO 레전드')  # "KBO legends"
search_box.send_keys(Keys.ENTER)
time.sleep(1)

url = driver.current_url
# url = "https://www.youtube.com/results?search_query=KBO+%EB%A0%88%EC%A0%84%EB%93%9C"
# print(url)
response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, 'lxml')
# print(response)
results = soup.select('h3 > a')
# print(type(results))
result = results[0:10]
# print(results)

kbo_title = []
kbo_link = []
kbo_commentlink = []  # addresses used for crawling the comments
for video in result:
    # print(video)
    # link = video.attrs['href'].replace('/watch?v=', '/embed/')
    link = video.attrs['href']    # crawl the link
    title = video.attrs['title']  # crawl the title
    # print(link, title)
    kbo_commentlink.append(link)
    kbo_link.append(link.replace('/watch?v=', '/embed/'))
    kbo_title.append(title)
# print(kbo_link)
# print(kbo_commentlink)
driver.close()

with open('./kbo_title_' + str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) + '.json', 'wt') as f:
    json.dump(kbo_title, f)
with open('./kbo_link_' + str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) + '.json', 'wt') as f:
    json.dump(kbo_link, f)

### Crawl YouTube comments ###
kbo_comments = []
for li in kbo_commentlink:
    delay = 3
    browser = Chrome()
    browser.implicitly_wait(delay)
    start_url = "https://www.youtube.com" + li
    browser.get(start_url)
    browser.maximize_window()
    print(start_url)
    time.sleep(3)

    body = browser.find_element_by_tag_name('body')
    pagedowns = 2  # page down twice
    while pagedowns:
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        pagedowns -= 1
    time.sleep(3)
    # print('@@@@@@@ fine up to this point')

    html0 = browser.page_source
    soup = BeautifulSoup(html0, 'lxml')
    comment_list = soup.find_all('yt-formatted-string', id='content-text', limit=5)

    comments = []
    for item in comment_list:  # renamed from `list`, which shadowed the builtin
        comments.append(item.text)
    # first-level list, one per video
    print(comments, '@@@ first-level list @@@')
    # second-level list combining all videos
    kbo_comments.append(comments)
    browser.close()

print(kbo_comments, '#### second-level list ####')
with open('./kbo_comments_' + str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) + '.json', 'wt') as f:
    json.dump(kbo_comments, f)
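# The timestamped-filename pattern above repeats three times; a small helper
# (a sketch, not in the original) would also keep one run's files on a single
# timestamp:
def timestamped(prefix, ext='.json'):
    """Build './<prefix>_<YYYYmmddHHMMSS><ext>' from the current local time."""
    return './{}_{}{}'.format(prefix, time.strftime('%Y%m%d%H%M%S'), ext)

# usage:
#   with open(timestamped('kbo_comments'), 'wt') as f:
#       json.dump(kbo_comments, f)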