def walk_on_site(driver: Chrome):
    """Simulate a human browsing session on the current site.

    For a random number of rounds, hover a random link, click it, dismiss
    any popup with ESC, then wander over random <div>s while scrolling a
    random amount.  Individual step failures are tolerated so one stale or
    obscured element does not abort the whole walk.

    Args:
        driver: an already-navigated Chrome webdriver instance.
    """
    for _ in range(randint(5, 15)):
        try:
            links = driver.find_elements_by_tag_name('a')
            action = ActionChains(driver)
            link = choice(links)
            action.move_to_element(link)
            action.perform()
            do_delay('fast')
            link.click()
            sleep(1)
            # Dismiss any popup/overlay the click may have opened.
            driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)
            do_delay('fast')
            # FIX: inner loop variable no longer shadows the outer counter.
            for _hover in range(randint(5, 40)):
                try:
                    random_div = choice(
                        driver.find_elements_by_tag_name('div'))
                    action.move_to_element(random_div)
                    action.perform()
                    do_delay('fast')
                    driver.execute_script(
                        f"window.scrollTo(0, {randint(1, 500)});")
                except Exception:
                    # Best-effort hovering: stale/unscrollable elements are
                    # expected here; a bare except also swallowed ^C before.
                    pass
            sleep(1)
        except Exception as e:
            print(e)
class Soundcloud_w():
    """Thin wrapper around a Chrome window opened on the Soundcloud likes page."""

    def __init__(self):
        # Spin up the browser (headless option left disabled) and load the page.
        opts = Options()
        #opts.set_headless()
        self.browser = Chrome(options=opts)
        self.browser.get(SOUNDCLOUD_LIKES)
        # Track list related state
        self._current_track_number = 1

    def play(self, track=None):
        '''
        Play a track. If no track number is supplied, the presently
        selected track will play.
        '''
        play_button = self.browser.find_element_by_class_name('playControl')
        self.browser.execute_script("arguments[0].click();", play_button)

    def pause(self):
        '''
        Pauses the playback
        '''
        # The play control is a toggle, so clicking it again pauses.
        self.play()

    def close_w(self):
        '''
        Closes the Chrome Soundcloud window
        '''
        self.browser.quit()
def switch_to_sn_default_comments_page(browser: Chrome, shop_url: str):
    """Open a second window and load the shop's default comments tab."""
    open_second_window(browser)
    print('------打开新窗口并正在加载默认评论页面------')
    comments_url = shop_url + '#productCommTitle'
    browser.get(comments_url)
    # Click the first tab under the comment-title block (default comments).
    browser.execute_script('document.querySelector("#productCommTitle > a:nth-child(1)").click()')
    print('------默认评论页面加载完成------')
    waiting_content_loading(browser, 'rv-target-item')
class DoutulaDownloaderMiddleware(object):
    """Scrapy downloader middleware that renders pages in a real Chrome
    browser so JavaScript-generated content (the pagination links) exists
    in the HTML handed back to the spider.
    """

    def __init__(self, timeout=10):
        # self.browser = PhantomJS()
        self.browser = Chrome()
        self.timeout = timeout
        self.wait = WebDriverWait(self.browser, self.timeout)

    def process_request(self, request, spider):
        """Load request.url in Chrome and return the rendered page.

        Waits for the pagination link to appear, scrolls it into view,
        then wraps the rendered source in an HtmlResponse.
        """
        self.browser.get(request.url)
        self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'span.page-link')))
        # BUG FIX: getElementsByClassName returns an HTMLCollection, which
        # has no scrollIntoView method, and `True` is not a JS literal.
        # Index the first element and pass the JS boolean `true`.
        self.browser.execute_script(
            'document.getElementsByClassName("page-link")[0].scrollIntoView(true)'
        )
        time.sleep(5)
        return HtmlResponse(url=request.url,
                            body=self.browser.page_source,
                            request=request,
                            status=200,
                            encoding='utf-8')

    def __del__(self):
        # quit() terminates the whole session; close() only closed one
        # window and leaked the chromedriver process.
        self.browser.quit()
def recent_post_links(username, post_count=10):
    """
    With the input of an account page, scrape the most recent post urls.

    Args:
        username: Instagram username
        post_count: default of 10, set as many or as few as you want

    Returns:
        A list with the unique url links for the most recent posts for the
        provided user (at most ``post_count`` entries).
    """
    url = "https://www.instagram.com/" + username + "/"
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    browser = Chrome(options=chrome_options)
    post = 'https://www.instagram.com/p/'
    post_links = []
    try:
        browser.get(url)
        stalled = 0
        while len(post_links) < post_count:
            links = [a.get_attribute('href')
                     for a in browser.find_elements_by_tag_name('a')]
            before = len(post_links)
            for link in links:
                # get_attribute may return None for anchors without an href.
                if link and post in link and link not in post_links:
                    post_links.append(link)
            # ROBUSTNESS: stop instead of looping forever when the account
            # has fewer posts than requested (two scrolls with no growth).
            stalled = stalled + 1 if len(post_links) == before else 0
            if stalled >= 2:
                break
            scroll_down = "window.scrollTo(0, document.body.scrollHeight);"
            browser.execute_script(scroll_down)
            time.sleep(5)
    finally:
        # BUG FIX: stop_client() alone leaked the browser/chromedriver
        # processes; quit() ends the whole session even on error.
        browser.quit()
    return post_links[:post_count]
def get_html_with_js(url, web_driver_path):
    """ Get html from url that uses Javascript to load

    Scrolls to the bottom repeatedly until the document height stops
    growing, then returns the fully rendered page source.

    Args:
        url: page to fetch.
        web_driver_path: path to the chromedriver executable.

    Returns:
        The rendered HTML as a string.
    """
    driver = Chrome(web_driver_path)
    try:
        driver.get(url)
        SCROLL_PAUSE_TIME = 2
        # Get scroll height
        last_height = driver.execute_script(
            "return document.documentElement.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.execute_script(
                "window.scrollTo(0, document.documentElement.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script(
                "return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        html = driver.page_source
    finally:
        # FIX: quit() in finally — close() outside try leaked the driver
        # process whenever an exception occurred mid-scroll.
        driver.quit()
    return html
def process_request(self, request, spider):
    """Render the Gaudiy community page in headless Chrome and return it.

    Clicks through to the NFT tab, scrolls until the infinite list stops
    growing, then hands the rendered HTML back to Scrapy.
    """
    options = ChromeOptions()
    options.headless = True
    driver = Chrome(options=options)
    try:
        driver.implicitly_wait(20)
        driver.get('https://gaudiy.com/community_details/avJEInz3EXlxNXKMSWxR')
        time.sleep(0.3)
        input_element = driver.find_elements_by_css_selector(
            'span:nth-child(5) > button > span > p')[0]
        if input_element:
            input_element.click()
        time.sleep(0.3)
        nft_element = driver.find_elements_by_css_selector(
            'span.MuiTab-wrapper')[0]
        if nft_element:
            nft_element.click()
        source_element = driver.find_element_by_css_selector(
            'label.MuiFormControlLabel-root')
        if source_element:
            # source_element.click()
            time.sleep(1.0)
        link = driver.find_elements_by_css_selector(
            'button > div > p:nth-child(1)')[-2]
        driver.execute_script("arguments[0].scrollIntoView(true);", link)
        time.sleep(0.3)
        # Keep scrolling while new items appear: the second-to-last entry
        # changes each time the infinite list grows.
        while link != driver.find_elements_by_css_selector(
                'button > div > p:nth-child(1)')[-2]:
            link = driver.find_elements_by_css_selector(
                'button > div > p:nth-child(1)')[-2]
            driver.execute_script("arguments[0].scrollIntoView(true);", link)
            time.sleep(0.3)
        return HtmlResponse(
            driver.current_url,
            body=driver.page_source,
            encoding='utf-8',
            request=request,
        )
    finally:
        # BUG FIX: driver.quit() was placed AFTER the return statement and
        # never ran, leaking one Chrome process per request; finally
        # guarantees cleanup on both success and error paths.
        driver.quit()
def get_options(driver: webdriver.Chrome, item: Text) -> Dict[Text, Dict[Text, Any]]:
    """Get all add-ons/options.

    Args:
        driver (Any): chrome driver
        item (Text): item page url

    Returns:
        Dict[Text, Dict[Text, Any]]: a dictionary of option, option info dict pairs
    """
    wait = WebDriverWait(driver, WAIT_TIME)
    options_dict = {}
    button_xpath = '//button[@class="sb-editField__button text-bold"]'
    try:
        # Scroll to the bottom so every option's edit button is rendered.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        wait.until(EC.visibility_of_all_elements_located((By.XPATH, button_xpath)))
        edit_buttons = driver.find_elements_by_xpath(button_xpath)
        # Click into each set of options and collect their data.
        options_dict = get_option_data(driver, edit_buttons)
    except Exception as e:
        logger.error(f"There was a problem getting the options for {item}")
        logger.error(e)
    return options_dict
def get_option_data(driver: webdriver.Chrome, edit_buttons: Any) -> Dict[Text, Dict[Text, Any]]:
    """Open each option field and get option data.

    Args:
        driver: chrome driver
        edit_buttons: webelements of the options 'edit' buttons

    Returns:
        Dict[Text, Dict[Text, Any]]: A dictionary of option name, option data pairs
    """
    wait = WebDriverWait(driver, WAIT_TIME)
    done_xpath = '//button[@data-e2e="doneFrap"]'
    option_elements_xpath = '//div[@class="selectLine___2LyZE"]/div[1]/div[1]'
    collected = {}
    for edit_button in edit_buttons:
        # Open the option editor and wait for its "done" button to render.
        edit_button.click()
        wait.until(EC.visibility_of_element_located((By.XPATH, done_xpath)))
        option_elements = driver.find_elements_by_xpath(option_elements_xpath)
        collected.update(get_selection(driver, option_elements))
        # Close the editor and scroll back down for the next button.
        driver.find_element_by_xpath(done_xpath).click()
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    return collected
def get_images(title):
    """Scrape image URLs from the photo grid of the page matching *title*.

    Returns:
        A list of https image URLs, or the sentinel ``1`` when anything
        fails (kept for backward compatibility with existing callers).
    """
    driver = None
    try:
        link = get_link(title)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = Chrome(chrome_options=chrome_options)
        driver.get(link)
        # One scroll to trigger lazy loading (was a single-iteration loop).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)
        sleep(5)
        bsObj = BeautifulSoup(driver.page_source, "html.parser")
        linkToMain = bsObj.find_all("div", {"class": "photo-list-photo-view"})
        images = []
        for tile in linkToMain:
            style = tile.attrs["style"]
            # The background image lives in the inline style: ...url("//host/...")...
            url = style.find("url(")
            if url == -1:
                # FIX: previously a missing "url(" produced a garbage slice
                # (find() returns -1); skip tiles without a background image.
                continue
            style = style[url + 5:-3]
            images.append("https:" + style)
        print(len(images), images, sep='\n')
        return images
    except Exception:
        # Preserve the original contract: any failure yields the sentinel 1
        # (the bare except also swallowed SystemExit/KeyboardInterrupt).
        return 1
    finally:
        # FIX: always release the browser — close() was skipped entirely
        # whenever an exception fired, leaking the driver process.
        if driver is not None:
            driver.quit()
def print_pdf_save_as(driver: webdriver.Chrome, browser="chrome", path_pdf="file.pdf"):  # pragma: no cover
    """
    Print and save the Web page HTML as PDF file in Chrome.
    Use handle_save_as for handle the Save as dialog window.
    Using when webdriver options is NOT headless.

    NOTE: Adjust the time.sleep according to your Environment
    Chrome Locales: EN
    """
    try:
        # Open Chrome's print-preview dialog; window.print() blocks, so the
        # async script call is expected to time out and the error is ignored.
        driver.execute_async_script("window.print();")
    except SeleniumTimeoutException as err:
        pass
    # Wait for render pdf preview
    time.sleep(60)
    # The print preview opens as a second window/tab; switch into it.
    driver.switch_to.window(driver.window_handles[1])
    try:
        now = time.time()
        while True:
            # Poll (up to ~30s) for the destination <select> buried inside
            # the print-preview UI's nested shadow DOM.
            expired = time.time() > now + 30
            dropdown: WebElement = driver.execute_script((
                "return document.querySelector('print-preview-app')."
                "shadowRoot.querySelector('print-preview-sidebar')."
                "shadowRoot.querySelector('print-preview-destination-settings')."
                "shadowRoot.querySelector('print-preview-destination-select')."
                "shadowRoot.querySelector('select.md-select');"))
            if dropdown:
                break
            if expired:
                raise TimeoutException("Timeout ocurred!")
        # 'Save as PDF/local/'
        _value = dropdown.get_attribute("value")
        if _value and _value[:4] != "Save":
            # Destination is not "Save as PDF" yet — open the dropdown and
            # move one entry down.  NOTE(review): assumes EN locale ordering;
            # confirm for other locales.
            dropdown.click()
            time.sleep(0.5)
            # Down until Save as PDF
            dropdown.send_keys(Keys.ARROW_DOWN)
            # Wait for Re-render pdf preview
            time.sleep(90)
        # Locate the blue "Save" action button in the preview sidebar.
        save = driver.execute_script(
            ("return document.querySelector('print-preview-app')."
             "shadowRoot.querySelector('print-preview-sidebar')."
             "shadowRoot.querySelector('print-preview-button-strip')."
             "shadowRoot.querySelector('cr-button.action-button');"))
        assert save
        save.click()
        # Handle Save as dialog in Windows
        # Wait for Save as dialog
        time.sleep(30)
        handle_save_as(browser, path_pdf)
    except TimeoutException as err:
        print(err)
    except NoSuchElementException:
        print("Error printing to PDF!")
    # Return focus to the original page window.
    driver.switch_to.window(driver.window_handles[0])
def toIdc(driver: webdriver.Chrome):
    """Open the IDC login page in a new tab and sign in if the form is shown."""
    driver.execute_script("window.open('http://idcenter.box.zonghengke.com/')")
    driver.switch_to.window(driver.window_handles[-1])
    login_inputs = driver.find_elements_by_id("in_user_Nm")
    # Only attempt the login when the username field is actually present.
    if len(login_inputs) > 0:
        # NOTE(review): credentials are hard-coded here — consider moving
        # them to configuration or environment variables.
        driver.find_element_by_id("in_user_Nm").send_keys("gaowenbo")
        driver.find_element_by_id("in_password").send_keys("YKUacrVjlfoR")
        driver.find_element_by_id("sign_in").click()
def _scrape_uploads(cls, driver: webdriver.Chrome, uploads_url: str) -> str:
    """Load a channel's uploads page, expand it fully, and return the HTML.

    Repeatedly scrolls to the bottom and clicks the "show more" button
    until it stops being clickable (timeout), then returns page_source.
    """
    logger = logging.getLogger("_scrape_uploads")
    logger.info("loading uploads page...: " + uploads_url)
    driver.get(uploads_url)
    # now we have to load all the videos, by repeatedly clicking the "show more" button.
    load_cnt = 0
    while True:
        try:
            show_more_button = WebDriverWait(driver, cls.TIME_OUT).until(
                e_c.element_to_be_clickable(
                    (By.CLASS_NAME, cls.SHOW_MORE_CLASS)))
        except TimeoutException as nse:
            # The button never became clickable: everything is loaded.
            logger.debug(str(nse))
            break
        # Bring the button into view, then expand the next batch of videos.
        driver.execute_script(
            "window.scrollTo(0,document.body.scrollHeight)")
        show_more_button.click()
        load_cnt += 1
        logger.info("loading uploads #" + str(load_cnt))
    # return the source
    return driver.page_source
class BaseDataClass():
    """Holds randomly generated account data plus a Chrome webdriver."""

    def __init__(self):
        self.status = 'Init'
        self.webdriver = Chrome(chrome_options=chrome_options)
        # Random 7-letter first/last names combined into "<first>A<last>".
        self.firstname = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(7))
        self.lastname = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(7))
        self.login = f'{self.firstname}A{self.lastname}'
        # 18-character alphanumeric password.
        self.pswd = ''.join(
            random.choice(string.ascii_lowercase + string.digits)
            for _ in range(18))
        # Login truncated to 15 characters for the Epic Games account.
        self.epicgames_login = self.login[:15]
        self.epicgames_pswd = self.pswd

    def save_to_log_file(self):
        """Print the status and log every attribute of this instance."""
        print(self.status)
        for attr_name in self.__dict__:
            logging.info(f' {attr_name} - {getattr(self, attr_name)}')
        logging.info(' ' + '-' * 100)

    def open_new_tab(self):
        """Open the Chrome settings page, then a fresh empty tab."""
        self.webdriver.get('chrome://settings')
        self.webdriver.execute_script("window.open()")

    def close_webdriver(self):
        """Close the current browser window."""
        self.webdriver.close()
def getdata(url, base_url):
    """Open the mobile news page headlessly and return the article ids.

    Clicks past the promo banner, opens the list tab, scrolls five times
    so more articles load, then extracts each section's data-item-id.
    (base_url is not used in the body; kept for interface compatibility.)
    """
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument(
        'user-agent="Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Mobile Safari/537.36"'
    )
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')  # headless browser
    driver = Chrome(options=option)
    driver.get(url)
    driver.implicitly_wait(3)
    # Dismiss the promo banner, then open the article list tab.
    driver.find_element_by_xpath('//*[@id="promoBannerIndex"]/a[2]').click()
    time.sleep(3)
    driver.find_element_by_xpath(
        '//*[@id="indexContainer"]/div/div[1]/div[2]/a[3]').click()
    time.sleep(3)
    # Scroll five times so more articles are lazy-loaded.
    for _ in range(5):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)
    # Extract the article ids from the rendered page.
    tree = etree.HTML(driver.page_source)
    gid_lists = tree.xpath(
        '//div[@class="list_content"]/section/@data-item-id')
    driver.quit()
    return gid_lists
class LianjiaDownloadMiddleware(object):
    """Downloader middleware that renders Lianjia pages in a real Chrome
    browser, scrolling to the bottom before handing the HTML to Scrapy.
    """

    def __init__(self, timeout=15):
        self.timeout = timeout
        self.browser = Chrome()
        self.browser.maximize_window()
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    @classmethod
    def from_crawler(cls, crawler):
        # The timeout comes from the project settings (TIMEOUT).
        return cls(timeout=crawler.settings.get('TIMEOUT'))

    def process_request(self, request, spider):
        """Fetch request.url with Chrome; return rendered HTML, or 500 on timeout."""
        logging.info('*****访问目标网页*****')
        print('~~~~~~~~~~~~当前URL~~~~~~~~~~~~~~~', request.url)
        try:
            self.browser.get(request.url)
            # Random pauses make the crawl look less bot-like.
            time.sleep(random.randint(6, 9))
            scroll_js = 'window.scrollTo(0, document.body.scrollHeight)'
            time.sleep(random.randint(3, 5))
            self.browser.execute_script(script=scroll_js)
            return HtmlResponse(url=request.url,
                                body=self.browser.page_source,
                                request=request,
                                encoding='utf8')
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
def process_request(self, request, spider):
    """Log into gushiwen with Chrome, solve the captcha from a screenshot,
    and attach the resulting session cookies to the outgoing request.

    Returning None lets Scrapy continue processing the request normally.
    """
    # Called for each request that goes through the downloader middleware.
    browser = Chrome()
    browser.get('https://so.gushiwen.cn/user/login.aspx')
    # Win10 defaults to 125% display scaling, so zoom the page to 80% to
    # make screenshot coordinates line up with element coordinates.
    browser.execute_script("document.body.style.zoom='0.8'")
    browser.maximize_window()
    browser.get_screenshot_as_file('code.png')
    code = browser.find_element_by_xpath('//img[@id="imgCode"]')
    # Crop the captcha image out of the full-page screenshot.
    left = int(code.location['x'])
    top = int(code.location['y'])
    right = left + int(code.size['width'])
    bottom = top + int(code.size['height'])
    captcha = Image.open('code.png').crop((left, top, right, bottom))
    captcha.save('code1.png')
    code_str = rec_code('code1.png')
    login(browser, code_str)
    listcookies = browser.get_cookies()
    print(listcookies)
    browser.close()
    request.cookies = listcookies  # attach the logged-in session cookies
    # request.cookies = get_cookie()
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
def get_vote_dict(driver: webdriver.Chrome, js_dict: dict, members: list,
                  check_interval: int = 10) -> dict:
    """Collect per-bill vote rows by executing each bill's JS on the vote site.

    Args:
        driver: Chrome driver.
        js_dict: mapping of bill number -> JS snippet that opens its vote page.
        members: full member list used to build each vote row.
        check_interval: print progress every N bills.

    Returns:
        dict mapping bill number -> vote row from get_vote_row.
    """
    def get_arr(s):
        # '없습니다' in the cell text means "there are none" -> empty list.
        if '없습니다' in s:
            return []
        return s.split()

    driver.get(BASE_URL)
    sleep(1)
    vote_dict = {}
    for i, (bill_no, js) in enumerate(js_dict.items()):
        driver.execute_script(js)
        sleep(2)
        bodies = driver.find_elements_by_tag_name('tbody')
        # The four tables are: pro, con, withdrawn, (ignored).
        pro, con, wdr, _ = [get_arr(body.text) for body in bodies]
        vote_dict[bill_no] = get_vote_row(members, pro, con, wdr)
        driver.back()
        sleep(1)
        if i % check_interval == 0:
            print('Complete: {} of {}'.format(i, inspect.stack()[0][3]))
    print('Complete: {}'.format(inspect.stack()[0][3]))
    return vote_dict
class Driver:
    """Small convenience wrapper around a Chrome webdriver."""

    def __init__(self):
        """Initialize chrome driver"""
        self.webdriver = r"drive/chromedriver"
        self.driver = Chrome(self.webdriver)

    def get_info(self, url):
        """Load *url*, scroll to the bottom, and return a parsed soup."""
        self.driver.get(url)
        self.driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def next_page(self, i, pl):
        """Allow to navigate between pages (page i+2, 12 items per page)."""
        paged_url = pl + '#page=' + str(i + 2) + '&perPage=12'
        return self.get_info(paged_url)

    def close(self):
        """Closes the driver"""
        self.driver.close()

    def quit(self):
        """Quits the driver"""
        self.driver.quit()
def switch_to_jd_sku_comments_page(browser: Chrome, sku_url: str):
    """Open a second window and load the default comments page for this JD SKU."""
    open_second_window(browser)
    print('------打开新窗口并正在加载当前SKU默认评论页面------')
    browser.get(sku_url + '#comment')
    # Select the "current SKU only" comment filter.
    browser.execute_script('document.getElementById("comm-curr-sku").click()')
    print('------当前SKU默认评论页面加载完成------')
    waiting_content_loading(browser, 'comment-item')
def retrieveTerabytePrices(strBusca):
    """Search terabyteshop.com.br for *strBusca* and scrape name/price pairs.

    Types the query one character at a time (to look human), submits it,
    then walks the result grid.

    Args:
        strBusca: the search string.

    Returns:
        dict mapping product name -> {"price": str | None, "availability": bool}.
    """
    #opts = Options()
    #opts.headless = True
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("start-maximized")
    chrome_options.add_argument(
        '--user-agent="Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"'
    )
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    # Hide the navigator.webdriver flag from basic bot detection.
    script = '''
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
    '''
    driver = Chrome(desired_capabilities=chrome_options.to_capabilities(),
                    chrome_options=chrome_options,
                    executable_path='C://chromedriver.exe')
    driver.execute_script(script)
    dictGPU = {}
    try:
        driver.get('https://www.terabyteshop.com.br/')
        textField = driver.find_element_by_xpath("//input[@id='isearch']")
        # Type the query slowly, character by character.
        for ch in strBusca:
            textField.send_keys(ch)
            time.sleep(0.3)
        time.sleep(5)
        textField.submit()
        time.sleep(10)
        time.sleep(2)
        divGPU = driver.find_elements_by_xpath("//div[@id='prodarea']")
        for i in range(len(divGPU)):
            gpuName = driver.find_element_by_xpath(
                "(//div[@class='commerce_columns_item_caption'])[" +
                str(i + 1) + "]//strong").text
            gpuPrice = driver.find_element_by_xpath(
                "(//div[@class='prod-new-price'])[" + str(i + 1) + "]/span")
            # BUG FIX: is_displayed is a method; without the parentheses the
            # bound-method object was always truthy, so every product was
            # reported as available regardless of visibility.
            if gpuPrice.is_displayed():
                gpuPrice = gpuPrice.text
                gpuAvailability = True
            else:
                gpuPrice = None
                gpuAvailability = False
            dictGPU[gpuName] = {
                "price": gpuPrice,
                "availability": gpuAvailability
            }
    finally:
        driver.quit()
    return dictGPU
class Browser:
    """Convenience wrapper around a Chrome webdriver bound to one base site."""

    defaulttimeout = 8  # default seconds to wait for elements

    def __init__(self, site_url, driverfile):
        self.driver = Chrome(driverfile)
        self.base_url = site_url
        random.seed()

    def getDriver(self):
        """Return the underlying webdriver."""
        return self.driver

    def openURL(self, url, delay=0):
        """Navigate to *url*, then wait *delay* seconds."""
        self.driver.get(url)
        self.wait(delay)

    def openSubPath(self, subpath, delay=0):
        """Open base_url + subpath; a leading '/' is added if missing."""
        if not subpath.startswith('/'):
            subpath = '/' + subpath
        self.openURL(self.base_url + subpath, delay)

    def quit(self):
        self.driver.close()

    def goBack(self):
        self.driver.back()

    def goForward(self):
        self.driver.forward()

    def scrollDown(self):
        """Scroll to the bottom of the page, then pause one second."""
        self.driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        self.wait(1)

    def wait(self, seconds):
        """Sleep *seconds* when positive; no-op otherwise."""
        if (seconds > 0):
            time.sleep(seconds)

    def microdelay(self):
        """Sleep a small random (Gaussian) amount to mimic human timing."""
        time.sleep(abs(round(random.gauss(0.3, 0.2), 2)))

    def getElementBy(self, method, query_string, timeout):
        """Wait up to *timeout* seconds for an element; None on failure."""
        try:
            waiter = WebDriverWait(self.driver, timeout)
            return waiter.until(
                EC.presence_of_element_located((method, query_string)))
        except:
            return None

    def getElementByXPath(self, xpath, timeout=defaulttimeout):
        return self.getElementBy(By.XPATH, xpath, timeout)

    def getElementByTag(self, tag, timeout=defaulttimeout):
        return self.getElementBy(By.TAG_NAME, tag, timeout)

    def getElementsByTag(self, tag):
        return self.driver.find_elements_by_tag_name(tag)

    def getBaseURL(self):
        """Return the site url this wrapper was constructed with."""
        return self.base_url
def ScrollToElem(driver: webdriver.Chrome, el: WebElement) -> None:
    """Scrolls the page to put the specified element in view

    Args:
        driver (webdriver): The page's webdriver
        el (WebElement): The element to put into view
    """
    center_script = 'arguments[0].scrollIntoView({block:"center"});'
    driver.execute_script(center_script, el)
def get_jd_comments(browser: Chrome, jd_ss: Union[Shop, JDSku], get_sku: bool = False, sku_mode: bool = False, summary: bool = False):
    """Page through a JD comment widget, persisting comments (and optionally
    SKUs / summaries) until the last page or an error.

    Args:
        browser: driver already positioned on the product/SKU comments page.
        jd_ss: the Shop or JDSku the comments belong to.
        get_sku: also extract SKUs referenced inside the comments.
        sku_mode: read the SKU-specific comment endpoint instead of the
            product-level one.
        summary: persist the productCommentSummary block as well.
    """
    # 141 is a sentinel meaning "real page count not known yet"; it is
    # replaced by the API's maxPage after the first successful page.
    max_page = 141
    while max_page > 0:
        try:
            # Fetch the comment JSONP for the current page.
            if sku_mode is True:
                jd_comments_url = 'skuProductPageComments'
            else:
                jd_comments_url = 'productPageComments'
            jd_comments = get_response_body(browser, jd_comments_url, 'GET')
            if jd_comments is None:
                print('---未找到评论接口数据---')
                break
            # Strip the JSONP wrapper fetchJSON_comment98(...); then parse.
            jd_comments = jd_comments.lstrip('fetchJSON_comment98(').rstrip(');')
            jd_comments = json.loads(jd_comments)
            # Persist this page's comments.
            comment_list = jd_comments['comments']
            insert_jd_comments(comment_list, jd_ss)
            if len(comment_list) == 0:
                print('该页评论数据0条')
                break
            # Optionally walk every SKU mentioned in the comments.
            if get_sku is True:
                get_sku_from_jd_comments(comment_list, jd_ss)
        except WebDriverException:
            print('---此页评论数据获取异常(WebDriverException), 跳过此分类---')
            break
        # First successful page: adopt the real page count and, if asked,
        # store the comment summary exactly once.
        if max_page == 141:
            max_page = jd_comments['maxPage']
            if sku_mode and summary:
                sku_summary = jd_comments['productCommentSummary']
                first_comment = comment_list[0]
                insert_jd_model_summary(sku_summary, first_comment, jd_ss)
            elif summary is True:
                total_summary = jd_comments['productCommentSummary']
                insert_jd_comment_summary(total_summary, jd_ss)
        # Last page reached: no need to scroll/click further.
        max_page -= 1
        print(f'本轮剩余页数: {max_page}')
        if max_page == 0:
            break
        # Scroll down until the "next page" button is clickable, then click it.
        while True:
            try:
                WebDriverWait(browser, 0.5).until(
                    ec.element_to_be_clickable((By.CLASS_NAME, 'ui-pager-next'))
                )
                browser.execute_script('document.getElementsByClassName("ui-pager-next")[0].click()')
                waiting_content_loading(browser, 'comment-item')
                break
            except TimeoutException:
                window_scroll_by(browser, 200)
    # All pages done: return to the first window and pause before moving on.
    back_to_first_window(browser)
    print('------当前浏览器窗口已关闭, 暂停10秒------')
    sleep(10)
def scroll_infinitely(driver: webdriver.Chrome, pause_time_seconds: float = 0.5):
    """Keep scrolling to the bottom until the page height stops growing."""
    previous = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time_seconds)
        current = driver.execute_script("return document.body.scrollHeight")
        if current == previous:
            # Nothing new loaded — we've hit the true bottom.
            break
        previous = current
def scroll_gradually(driver: webdriver.Chrome, pause_time_seconds=0.5, scroll_amount=200):
    """Scroll down *scroll_amount* px at a time until the offset stops changing."""
    offset = driver.execute_script("return window.pageYOffset")
    while True:
        driver.execute_script("window.scrollTo(0, {0});".format(offset + scroll_amount))
        time.sleep(pause_time_seconds)
        new_offset = driver.execute_script("return window.pageYOffset")
        if new_offset == offset:
            # The viewport didn't move — bottom of the page reached.
            break
        offset = new_offset
def check_driver():
    """Smoke-test chromedriver: open Google and show a success alert.

    Prints an error message instead of raising when startup fails.
    """
    try:
        driver_path = os.path.join(CHROME_DRIVER_DIR_PATH, "chromedriver.exe")
        driver = Chrome(driver_path)
        driver.get("https://www.google.co.jp")
        driver.execute_script("alert('ダウンロードが成功しました。画面を閉じてください')")
        print("正常に起動しました")
        # Leave the window open so the user can read the alert.
        time.sleep(100)
    except Exception as e:
        print(f"起動エラー:{e}")
def turn_to_the_next_page(browser: Chrome):
    """Click the '.more' button to load the next page, scrolling until it is clickable."""
    clicked = False
    while not clicked:
        try:
            WebDriverWait(browser, 0.5).until(
                ec.element_to_be_clickable((By.CLASS_NAME, 'more')))
            browser.execute_script('document.querySelector(".more").click()')
            waiting_content_loading(browser, 'common')
            clicked = True
        except TimeoutException:
            # Button not in view/clickable yet — scroll further and retry.
            window_scroll_by(browser, 500)
def scroll_screen(driver: Chrome, stime: int = 5, sleep: int = 1,
                  step: int = 1000) -> None:
    """Scroll the page downward *stime* times in fixed steps, pausing between each.

    Args:
        driver: Chrome webdriver.
        stime: number of scroll steps.
        sleep: seconds to pause after each step.
        step: pixels to scroll per step.
    """
    offset = 0
    for _ in range(stime):
        # Produces the same JS text as before: scrollBy(<offset>,+<step>);
        driver.execute_script(f"scrollBy({offset},+{step});")
        offset += step
        time.sleep(sleep)
def fetch():
    """Scrape new HackerOne hacktivity reports and prepend them to data.csv.

    Reads the existing CSV to find the most recent known report link, then
    scrolls the hacktivity page until either that report is encountered or
    the page stops growing, and rewrites data.csv with the merged list.
    """
    options = ChromeOptions()
    options.add_argument('headless')
    driver = Chrome(
        executable_path=
        "/Users/tuanthanhtran/Desktop/training/bug-bounty/hackerone-reports/chromedriver",
        options=options)
    # Load the reports we already have; row order puts the newest first.
    reports = []
    with open('data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reports.append(dict(row))
    first_report_link = reports[0]['link']
    driver.get(hacktivity_url)
    driver.implicitly_wait(page_loading_timeout)
    counter = 0  # consecutive scrolls that produced no height change
    page = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(page_loading_timeout)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Stop after two consecutive scrolls that load nothing new.
            counter += 1
            if counter > 1:
                break
        else:
            counter = 0
        last_height = new_height
        raw_reports = driver.find_elements_by_class_name('fade')
        new_reports = extract_reports(raw_reports)
        # If the newest previously-known report appears, prepend only the
        # reports that are newer than it and stop scrolling.
        found = False
        for i in range(len(new_reports)):
            if new_reports[i]['link'] == first_report_link:
                reports = new_reports[:i] + reports
                found = True
                break
        if found:
            break
        page += 1
        print('Page:', page)
    driver.close()
    # Rewrite the CSV with the merged report list.
    with open('data.csv', 'w', newline='', encoding='utf-8') as file:
        keys = reports[0].keys()
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(reports)
def start_callback():
    """ Main loop of the scrape.

    Reads the target username and output directory from the GUI entries,
    walks the profile's post grid in Chrome, and downloads each image.
    In update mode, stops (via exit(0)) once it reaches the newest image
    already present in the output directory.
    """
    profile_username = E_username.get()  # The Instagram username of the profile from which we
                                         # are downloading. Must be supplied.
    output_directory = E_path.get()  # Will be initialized with the optional argument or a
                                     # default later.
    update_mode = True
    serialize = True
    latest_image = ''
    # The latest downloaded images will be the first in the directory.
    # NOTE(review): assumes os.listdir order puts the newest file first —
    # confirm; listdir order is filesystem-dependent.
    files = os.listdir(output_directory)
    if files:
        latest_image = files[0]
    # Start the browser
    driver = Chrome(executable_path='../bin/chromedriver')
    driver.get(insta_url + profile_username)
    # Find the number of posts on this Instagram profile
    post_count_tag_xpath = ('//*[@id="react-root"]/section/main/' +
                            'article/header/div[2]/ul/li[1]/span/span')
    post_count_tag = driver.find_element_by_xpath(post_count_tag_xpath)
    post_count = int(post_count_tag.text.replace(',', ''))
    # If the target profile is private, then redirect to the login page
    login_tag_xpath = '//*[@id="react-root"]/section/main/article/div/p/a'
    try:
        login_tag = driver.find_element_by_xpath(login_tag_xpath)
        login_page_url = login_tag.get_attribute('href')
        driver.get(login_page_url)
        # Wait for the user to login (polling until the URL changes)
        while driver.current_url == login_page_url:
            sleep(1)
        # Return to the target profile from the homepage
        driver.get(insta_url + profile_username)
    except:
        # Profile is public: no login link found, continue directly.
        pass
    # Click the 'Load More' element
    driver.find_element_by_class_name('_oidfu').click()
    # Load all the posts into the browser
    processed = 0
    while processed < post_count:
        # Load more content by scrolling to the bottom of the page
        driver.execute_script("window.scrollTo(0, 0);")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Download 4 rows of items (4 rows are loaded upon each scroll) and
        # remove them from view
        for _ in itertools.repeat(None, 4):
            urls = fetch_row_links(driver)
            delete_row(driver)
            for url in urls:
                # Exit if we've reached the latest image that was in the
                # directory before downloading. This means the directory has
                # everything beyond this point.
                if update_mode:
                    fname = file_name.search(url).group(0)
                    if fname in latest_image:
                        exit(0)
                download_from_url(url, output_directory, serialize,
                                  post_count - processed)
                processed += 1
    driver.close()
class SeleniumTestCase(LiveServerTestCase):
    """
    Base class for Selenium tests.  Allows tests to be written
    independently of which browser they're going to be run in.

    The browser is picked via the ``SELENIUM_BROWSER`` environment
    variable (falling back to ``settings.SELENIUM_DEFAULT_BROWSER``);
    setting ``SELENIUM_HOST`` routes the session through Sauce Labs.
    The active WebDriver is stored on ``self.sel``.
    """

    @classmethod
    def appium_command_executor(cls):
        """ Get the command executor URL for iOS simulator testing.

        The result is computed once and cached on the class as
        ``_appium_executor``.
        """
        if hasattr(cls, '_appium_executor'):
            return cls._appium_executor
        # Get the address iWebDriver will connect to
        address = None
        try:
            address = socket.gethostbyname(socket.gethostname())
        except socket.error:
            # Name resolution failed; use the default address below
            pass
        # If we don't have an address we should use localhost
        if not address:
            address = '127.0.0.1'
        port = 4723  # Appium's default port
        cls._appium_executor = "".join(["http://", address, ":", str(port),
                                        '/wd/hub'])
        return cls._appium_executor

    @classmethod
    def setUpClass(cls):
        # Create the screenshots directory if it doesn't exist yet
        screenshot_dir = settings.SELENIUM_SCREENSHOT_DIR
        if screenshot_dir and not os.path.exists(screenshot_dir):
            os.makedirs(screenshot_dir)
        super(SeleniumTestCase, cls).setUpClass()

    @classmethod
    def tearDownClass(cls):
        super(SeleniumTestCase, cls).tearDownClass()

    def setUp(self):
        """ Start a new browser instance for each test """
        self._screenshot_number = 1
        self.browser = os.getenv('SELENIUM_BROWSER',
                                 settings.SELENIUM_DEFAULT_BROWSER)
        # SELENIUM_HOST implies a remote Sauce Labs session regardless of
        # which browser name was requested.
        if os.getenv('SELENIUM_HOST'):
            self.sel = self.sauce_labs_driver()
        elif self.browser == 'firefox':
            self.sel = Firefox()
        elif self.browser == 'htmlunit':
            self.sel = RemoteWebDriver(
                desired_capabilities=DesiredCapabilities.HTMLUNITWITHJS)
        elif self.browser in ['ios', 'ipad', 'ipod', 'iphone']:
            capabilities = {
                'app': 'safari',
                'browserName': '',
                'device': 'iPhone Simulator',
                'os': 'iOS 6.1'
            }
            self.sel = RemoteWebDriver(
                command_executor=self.appium_command_executor(),
                desired_capabilities=capabilities)
        elif self.browser == 'opera':
            self.sel = RemoteWebDriver(
                desired_capabilities=DesiredCapabilities.OPERA)
        elif self.browser == 'iexplore':
            self.sel = RemoteWebDriver(
                desired_capabilities=DesiredCapabilities.INTERNETEXPLORER)
        elif self.browser == 'phantomjs':
            self.sel = PhantomJS(service_args=['--debug=true',
                                               '--webdriver-loglevel=DEBUG'])
        elif self.browser == 'safari':
            # requires a Safari extension to be built from source and installed
            self.sel = RemoteWebDriver(
                desired_capabilities=DesiredCapabilities.SAFARI)
        else:
            self.sel = Chrome()
        self.sel.set_page_load_timeout(settings.SELENIUM_PAGE_LOAD_TIMEOUT)
        # Give the browser a little time; Firefox throws random errors if you
        # hit it too soon
        time.sleep(1)

    def tearDown(self):
        # Check to see if an exception was raised during the test
        info = sys.exc_info()
        passed = info[0] is None
        if not passed:
            # Want to see what went wrong
            self.screenshot()
        self.report_status(passed)
        if hasattr(self, 'sel'):
            self.sel.quit()
        super(SeleniumTestCase, self).tearDown()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~ Selenium operations ~~~~~~~~~~~~~~~~~~~~~~~~~~

    def assert_hidden(self, selector):
        """ Assert the element matching ``selector`` exists but is hidden """
        element = self.wait_for_element(selector)
        msg = "'%s' should not be visible" % selector
        assert not element.is_displayed(), msg

    def assert_not_present(self, selector):
        """ Assert no element matches ``selector`` at all """
        assert_raises(NoSuchElementException,
                      self.sel.find_element_by_css_selector, selector)

    def assert_not_visible(self, selector):
        """ Ok if it's either missing or hidden """
        try:
            element = self.sel.find_element_by_css_selector(selector)
        except NoSuchElementException:
            return
        msg = "'%s' should not be visible" % selector
        assert not element.is_displayed(), msg

    def assert_text_not_in_element(self, selector, text):
        """ Verify that the specified element does not contain certain text """
        msg = "'%s' should not contain the text '%s'" % (selector, text)
        content = self.sel.find_element_by_css_selector(selector).text
        assert text not in content, msg

    def assert_visible(self, selector):
        """ Assert the element matching ``selector`` exists and is shown """
        element = self.wait_for_element(selector)
        msg = "'%s' should be visible" % selector
        assert element.is_displayed(), msg

    def audit_accessibility(self):
        """
        Check for accessibility violations using the JavaScript library
        from Chrome's Developer Tools.  Raises ``failureException`` with
        the audit report if any rule FAILs.
        """
        # First add the library to the page (collapsed to a single line so
        # it survives execute_script)
        script = ''.join(line.strip()
                         for line in ADD_ACCESSIBILITY_SCRIPT.splitlines())
        self.sel.execute_script(script)
        # Wait for the script to finish loading
        self.wait_for_condition(
            'return axs.AuditRule.specs.videoWithoutCaptions !== "undefined";')
        # Now run the audit and inspect the results
        self.sel.execute_script('axs_audit_results = axs.Audit.run();')
        failed = self.sel.execute_script(
            'return axs_audit_results.some(function (element, index, array) '
            '{ return element.result === "FAIL" });')
        if failed:
            report = self.sel.execute_script(
                'return axs.Audit.createReport(axs_audit_results);')
            raise self.failureException(report)

    def click(self, selector):
        """ Click the element matching the selector (and retry if it
        isn't visible or clickable yet) """
        element = self.wait_for_element(selector)
        element_was_clicked = lambda driver: lambda_click(element)
        msg = "The element matching '%s' should be clickable" % selector
        Wait(self.sel).until(element_was_clicked, msg)
        return element

    def click_link_with_text(self, text):
        """ Wait for a link with the given text, click it, return it """
        link_is_present = lambda driver: driver.find_element_by_link_text(text)
        msg = "A link with text '%s' should be present" % text
        link = Wait(self.sel).until(link_is_present, msg)
        link.click()
        return link

    def click_link_with_xpath(self, xpath):
        """ Wait for a link matching the xpath, click it, return it """
        link_is_present = lambda driver: driver.find_element_by_xpath(xpath)
        msg = "A link with xpath '%s' should be present" % xpath
        link = Wait(self.sel).until(link_is_present, msg)
        link.click()
        return link

    def enter_text(self, selector, value):
        """ Type ``value`` into the field matching ``selector`` """
        field = self.wait_for_element(selector)
        field.send_keys(value)
        self.screenshot()
        return field

    def enter_text_via_xpath(self, xpath, value):
        """ Type ``value`` into the field matching ``xpath`` """
        field = self.wait_for_xpath(xpath)
        field.send_keys(value)
        self.screenshot()
        return field

    def get(self, relative_url):
        """ Load a page relative to the live test server's root URL """
        self.sel.get('%s%s' % (self.live_server_url, relative_url))
        self.screenshot()

    def screenshot(self):
        """ Save a numbered screenshot for the current test, if possible """
        if hasattr(self, 'sauce_user_name'):
            # Sauce Labs is taking screenshots for us
            return
        if not hasattr(self, 'browser') or self.browser == 'htmlunit':
            # Can't take screenshots
            return
        screenshot_dir = settings.SELENIUM_SCREENSHOT_DIR
        if not screenshot_dir:
            return
        name = "%s_%d.png" % (self._testMethodName, self._screenshot_number)
        path = os.path.join(screenshot_dir, name)
        self.sel.get_screenshot_as_file(path)
        self._screenshot_number += 1

    def select_by_text(self, selector, text):
        """ Choose the option with the given visible text """
        select = Select(self.wait_for_element(selector))
        select.select_by_visible_text(text)
        self.screenshot()
        return select

    def select_by_value(self, selector, value):
        """ Choose the option with the given value attribute """
        select = Select(self.wait_for_element(selector))
        select.select_by_value(value)
        self.screenshot()
        return select

    def select_text(self, selector, start=0, end=-1):
        """ Selects the specified text range of the element matching the
        provided selector by simulating a mouse down, programmatically
        selecting the text, and then simulating a mouse up.  Doesn't yet
        work on IE < 9 or iOS.  Doesn't support nested markup either.
        """
        # Build (and cache) the one-line JS template on first use
        if not hasattr(self, 'select_text_template'):
            self.select_text_template = ''.join(
                line.strip() for line in SELECT_TEXT_SOURCE.splitlines())
        script = self.select_text_template % (selector, start, end)
        self.sel.execute_script(script)
        self.screenshot()

    def wait_for_background_color(self, selector, color_string):
        """ Wait until the element's background-color matches """
        color = Color.from_string(color_string)
        correct_color = lambda driver: Color.from_string(
            driver.find_element_by_css_selector(selector)
            .value_of_css_property("background-color")) == color
        msg = "The color of '%s' should be %s" % (selector, color_string)
        Wait(self.sel).until(correct_color, msg)
        self.screenshot()

    def wait_for_condition(self, return_statement, msg=None):
        """Wait until the provided JavaScript expression returns true.
        Note: for this to work, the expression must include the "return"
        keyword, not just the expression to be evaluated."""
        condition_is_true = lambda driver: driver.execute_script(return_statement)
        if not msg:
            msg = '"{}" never became true'.format(return_statement)
        Wait(self.sel).until(condition_is_true, msg)

    def wait_for_element(self, selector):
        """ Wait for an element matching the CSS selector and return it """
        element_is_present = lambda driver: driver.find_element_by_css_selector(selector)
        msg = "An element matching '%s' should be on the page" % selector
        element = Wait(self.sel).until(element_is_present, msg)
        self.screenshot()
        return element

    def wait_for_text(self, text):
        """ Wait until ``text`` appears anywhere in the page source """
        text_is_present = lambda driver: text in driver.page_source
        msg = "The text '%s' should be present on the page" % text
        Wait(self.sel).until(text_is_present, msg)
        self.screenshot()

    def wait_for_xpath(self, xpath):
        """ Wait for an element matching the xpath and return it """
        element_is_present = lambda driver: driver.find_element_by_xpath(xpath)
        msg = "An element matching '%s' should be on the page" % xpath
        element = Wait(self.sel).until(element_is_present, msg)
        self.screenshot()
        return element

    def wait_until_element_contains(self, selector, text):
        """ Wait until the specified element contains certain text """
        text_contained = lambda driver: text in driver.find_element_by_css_selector(selector).text
        msg = "'%s' should contain the text '%s'" % (selector, text)
        Wait(self.sel).until(text_contained, msg)
        self.screenshot()

    def wait_until_hidden(self, selector):
        """ Wait until the element matching the selector is hidden """
        element = self.wait_for_element(selector)
        element_is_hidden = lambda driver: not element.is_displayed()
        msg = "The element matching '%s' should not be visible" % selector
        Wait(self.sel).until(element_is_hidden, msg)
        self.screenshot()
        return element

    def wait_until_not_present(self, selector):
        """ Wait until the element matching the selector is gone from page """
        element_is_present = lambda driver: driver.find_element_by_css_selector(selector)
        msg = "There should not be an element matching '%s'" % selector
        Wait(self.sel).until_not(element_is_present, msg)
        self.screenshot()

    def wait_until_not_visible(self, selector):
        """ Wait until the element matching the selector is either hidden or
        removed from the page """
        element_is_visible = lambda driver: driver.find_element_by_css_selector(selector).is_displayed()
        msg = "The element matching '%s' should not be visible" % selector
        Wait(self.sel).until_not(element_is_visible, msg)
        self.screenshot()

    def wait_until_option_added(self, selector, option_text):
        """ Wait until the specified select option appears; the entire
        select widget may be replaced in the process """
        end_time = time.time() + settings.SELENIUM_TIMEOUT
        while True:
            try:
                select = Select(self.sel.find_element_by_css_selector(selector))
                for option in select.options:
                    if option.text == option_text:
                        return option
            except (NoSuchElementException, StaleElementReferenceException):
                # Widget may be mid-replacement; poll again
                pass
            time.sleep(settings.SELENIUM_POLL_FREQUENCY)
            if time.time() > end_time:
                break
        raise TimeoutException("Select option should have been added")

    def wait_until_option_disabled(self, selector, option_text):
        """ Wait until the specified select option is disabled; the entire
        select widget may be replaced in the process """
        end_time = time.time() + settings.SELENIUM_TIMEOUT
        while True:
            try:
                select = Select(self.sel.find_element_by_css_selector(selector))
                for option in select.options:
                    if option.text == option_text and not option.is_enabled():
                        return option
            except (NoSuchElementException, StaleElementReferenceException):
                # Widget may be mid-replacement; poll again
                pass
            time.sleep(settings.SELENIUM_POLL_FREQUENCY)
            if time.time() > end_time:
                break
        raise TimeoutException("Select option should have been disabled")

    def wait_until_property_equals(self, selector, name, value):
        """ Wait until the specified CSS property of the element matching
        the provided selector matches the expected value """
        value_is_correct = lambda driver: driver.find_element_by_css_selector(selector).value_of_css_property(name) == value
        msg = "The %s CSS property of '%s' should be %s" % (name, selector,
                                                            value)
        Wait(self.sel).until(value_is_correct, msg)
        self.screenshot()

    def wait_until_offscreen(self, selector):
        """ Wait until the element matching the provided selector has been
        moved offscreen (deliberately, not just scrolled out of view) """
        end_time = time.time() + settings.SELENIUM_TIMEOUT
        while True:
            try:
                element = self.sel.find_element_by_css_selector(selector)
                location = element.location
                size = element.size
                # Offscreen means entirely above or entirely left of the page
                if location["y"] + size["height"] <= 0:
                    self.screenshot()
                    return True
                if location["x"] + size["width"] <= 0:
                    self.screenshot()
                    return True
            except (NoSuchElementException, StaleElementReferenceException):
                pass
            time.sleep(settings.SELENIUM_POLL_FREQUENCY)
            if time.time() > end_time:
                break
        raise TimeoutException("'%s' should be offscreen" % selector)

    def wait_until_onscreen(self, selector):
        """ Wait until the element matching the provided selector has been
        moved into the viewable page """
        end_time = time.time() + settings.SELENIUM_TIMEOUT
        while True:
            try:
                element = self.sel.find_element_by_css_selector(selector)
                location = element.location
                if location["x"] >= 0 and location["y"] >= 0:
                    self.screenshot()
                    return True
            except (NoSuchElementException, StaleElementReferenceException):
                pass
            time.sleep(settings.SELENIUM_POLL_FREQUENCY)
            if time.time() > end_time:
                break
        # Fixed: message previously said "offscreen" (copy-paste from
        # wait_until_offscreen)
        raise TimeoutException("'%s' should be onscreen" % selector)

    def wait_until_property_less_than(self, selector, name, value):
        """ Wait until the specified CSS property of the element matching
        the provided selector is less than a certain value.  Ignores any
        non-integer suffixes like 'px'.
        """
        value_is_correct = lambda driver: int(re.match(
            r'([\d-]+)',
            driver.find_element_by_css_selector(selector)
            .value_of_css_property(name)).group(1)) < value
        msg = "The %s CSS property of '%s' should be less than %s" % (
            name, selector, value)
        Wait(self.sel).until(value_is_correct, msg)
        self.screenshot()

    def wait_until_visible(self, selector):
        """ Wait until the element matching the selector is visible """
        element = self.wait_for_element(selector)
        element_is_visible = lambda driver: element.is_displayed()
        msg = "The element matching '%s' should be visible" % selector
        Wait(self.sel).until(element_is_visible, msg)
        return element

    # ~~~~~~~~~~~~~~~~~~~~~~~~~ Sauce Labs support ~~~~~~~~~~~~~~~~~~~~~~~~~~

    def sauce_labs_driver(self):
        """ Configure the Selenium driver to use Sauce Labs """
        host = os.getenv("SELENIUM_HOST", "ondemand.saucelabs.com")
        port = os.getenv("SELENIUM_PORT", "80")
        executor = "".join(["http://", host, ":", port, '/wd/hub'])
        platform = os.getenv("SELENIUM_PLATFORM", "Windows 7")
        version = os.getenv("SELENIUM_VERSION", "")
        self.sauce_user_name = os.getenv("SAUCE_USER_NAME")
        self.sauce_api_key = os.getenv("SAUCE_API_KEY")
        tunnel_id = os.getenv("SAUCE_TUNNEL_ID", "")
        build_number = os.getenv('BUILD_NUMBER')
        job_name = os.getenv('JOB_NAME')
        # http://code.google.com/p/selenium/wiki/DesiredCapabilities
        # https://saucelabs.com/docs/additional-config#desired-capabilities
        caps = {
            'accessKey': self.sauce_api_key,
            'capture-html': True,
            'browserName': self.browser,
            'javascriptEnabled': True,
            'name': self.id(),
            'platform': platform,
            'username': self.sauce_user_name,
            'version': version,
        }
        if build_number and job_name:
            caps['build'] = '{} #{}'.format(job_name, build_number)
        if tunnel_id:
            caps['tunnel-identifier'] = tunnel_id
        if settings.SELENIUM_SAUCE_VERSION:
            caps['selenium-version'] = settings.SELENIUM_SAUCE_VERSION
        remote = webdriver.Remote(command_executor=executor,
                                  desired_capabilities=caps)
        # Store the Sauce session ID to output later for Jenkins integration
        # See https://saucelabs.com/jenkins/5 for details
        sauce_sessions.append('SauceOnDemandSessionID={} job-name={}'.format(
            remote.session_id, self.id()))
        return remote

    def report_status(self, passed):
        """Report to Sauce Labs whether or not the test passed, so that
        can be reflected in their UI."""
        if not hasattr(self, 'sauce_user_name'):
            # Not using Sauce Labs for this test
            return
        url_pattern = 'http://{}:{}@saucelabs.com/rest/v1/{}/jobs/{}'
        url = url_pattern.format(self.sauce_user_name, self.sauce_api_key,
                                 self.sauce_user_name, self.sel.session_id)
        body_content = json.dumps({"passed": passed})
        headers = {
            'Content-Type': 'application/json',
        }
        response = requests.put(url, body_content, headers=headers)
        return response.status_code == 200
def order(shop=None, browser=None, lego_set=None, order_list=None,
          username=None, password=None):
    """
    Fill in LEGO parts to be ordered in LEGO's customer service shop.

    :param shop: locale path segment interpolated into the shop URL
        (e.g. ``'en-us'``).
    :param browser: ``'chrome'`` to drive Chrome; any other value
        (including ``None``) gets Firefox.
    :param lego_set: the set number whose replacement parts are wanted.
    :param order_list: required comma-separated ``part:quantity`` pairs,
        e.g. ``'3001:4,3003:2'``.
    :param username: optional LEGO ID; if given together with
        ``password``, logs in before ordering.
    :param password: password for ``username``.
    :raises ValueError: if ``order_list`` is missing or empty.
    """
    # Validate before the (slow) selenium imports and browser launch;
    # previously a missing order_list crashed later with a cryptic
    # AttributeError on None.split(',').
    if not order_list:
        raise ValueError(
            "order_list is required: comma-separated part:quantity pairs")

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver import Chrome, Firefox
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.select import Select
    from selenium.webdriver.support.wait import WebDriverWait
    from time import sleep

    order_list = order_list.split(',')
    shop_url = ('https://wwwsecure.us.lego.com/{shop}/service/'
                'replacementparts/order'.format(shop=shop))
    # NOTE: rebinds the `browser` parameter from a name to a driver object.
    browser = Chrome() if browser == 'chrome' else Firefox()
    browser.get(shop_url)

    print("Sometimes they ask you to fill in a survey.")
    try:
        survey_layer = browser.find_element_by_id('ipeL104230')
        survey_layer.send_keys(Keys.ESCAPE)
    except NoSuchElementException:
        print("We're lucky, no survey on the LEGO shop today!")

    # An age gate precedes the actual order form.
    print("They want to know how old we are.")
    age_field = browser.find_element_by_name('rpAgeAndCountryAgeField')
    age_field.send_keys('55')
    age_field.send_keys(Keys.RETURN)

    if username and password:
        print("Let's log in with LEGO ID {user}.".format(user=username))
        login_link = browser.find_element_by_css_selector('.legoid .links > a')
        login_link.click()
        # The login form lives in an iframe.
        browser.switch_to.frame('legoid-iframe')
        user_input = browser.find_element_by_id('fieldUsername')
        user_input.click()
        user_input.send_keys(username)
        passwd_input = browser.find_element_by_id('fieldPassword')
        passwd_input.click()
        passwd_input.send_keys(password)
        login_button = browser.find_element_by_id('buttonSubmitLogin')
        login_button.click()
        browser.switch_to.default_content()
        sleep(4)  # seconds

    wait = WebDriverWait(browser, 5)
    print("We need to tell them which set we want to buy parts from: {lego_set}".format(lego_set=lego_set))
    setno_field = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '.product-search input[ng-model=productNumber]')))
    setno_field.send_keys(lego_set)
    setno_field.send_keys(Keys.RETURN)

    print("Let's scroll the page down a bit, so we can see things better.")
    browser.execute_script("window.scroll(0, 750);")

    print("That's gonna be crazy: {count} elements to order! Let's rock.".format(count=len(order_list)))
    element_field = wait.until(EC.element_to_be_clickable(
        (By.ID, 'element-filter')))
    print()

    for brick in order_list:
        part_no, quantity = brick.split(':')
        print("- {qty}x #{pn} ".format(qty=quantity, pn=part_no), end='')
        # Search the part number within the set.
        element_field.clear()
        element_field.send_keys(part_no)
        element_field.send_keys(Keys.RETURN)
        sleep(.3)  # seconds
        try:
            add_button = browser.find_element_by_css_selector('.element-details + button')
            add_button.click()
            sleep(.2)  # seconds
        except NoSuchElementException:
            print("OOOPS! No LEGO part with that number found in set #{set}. :-(".format(set=lego_set))
            continue
        # Out-of-stock parts can still be added via a confirmation button.
        try:
            warn_msg = browser.find_element_by_css_selector('.alert-warning .sold-out-info')
            if warn_msg.is_displayed():
                print("NOTE: item out of stock. ", end='')
                add_anyway = browser.find_element_by_css_selector('.alert-warning + .clearfix button')
                add_anyway.click()
        except NoSuchElementException:
            pass
        # The newest bag item is the last select on the page.
        amount_select = browser.find_elements_by_css_selector('.bag-item select')[-1]
        amount_select.send_keys(quantity)
        amount_select.send_keys(Keys.TAB)
        selected = Select(amount_select).first_selected_option
        if quantity != selected.text:
            print("WARNING: Could not select desired quantity. {} != {}".format(quantity, selected.text))
        else:
            print()

    browser.execute_script("window.scroll(0, 0);")
    print()
    print("We're done. You can finalize your order now. Thanks for watching!")