def take_screenshots():
    driver = Chrome()
    driver.get('http://127.0.0.1:7000')
    driver.set_window_size(1260, 800)
    driver.save_screenshot('main.png')
    driver.find_element_by_css_selector('.feed-post').click()
    time.sleep(5)  # Wait for MathJax to do its thing.
    driver.save_screenshot('post.png')
    driver.close()
def extract_citation_for_publication(link):
    """
    This function crawls the list of citing articles from a given link.
    If there is a next page, it continues until there is none.
    @param[in] link the link of the Google Scholar citation page you want to crawl
    @return the citations as a dictionary keyed by title
    """
    browser = Browser('chromedriver.exe')
    citation = {}
    # Go to the citation view. As the page is rendered with JavaScript, we
    # cannot get its content via urllib2; instead we use Selenium to simulate
    # a web browser rendering the page.
    browser.get(link)
    while True:
        citation_root = browser.find_element_by_id('gs_ccl')
        citation_list = citation_root.find_elements_by_class_name('gs_r')
        for citation_item in citation_list:
            # Title.
            title = citation_item.find_element_by_class_name('gs_rt').text
            # Try to get the download link, if there is one.
            try:
                link = citation_item.find_element_by_id('gs_ggsW2')
                link = link.find_element_by_link_text(link.text).get_attribute('href')
            except:
                link = None
            # Authors.
            author_line = citation_item.find_element_by_class_name('gs_a')
            author_name = author_line.text.split(', ')
            author = {}
            # For each author, find the profile link if it exists.
            for a in author_name:
                try:
                    print('.', end='')
                    # There is a Google Scholar profile for this author.
                    item = author_line.find_element_by_link_text(a)
                    author[a] = item.get_attribute('href')
                except:
                    # There is no such profile.
                    author[a] = None
            # We could also press the cite button to get the detailed citation
            # information; skipped here.
            citation[title] = {'link': link, 'author': author}
        # Go to the next page, if there is one.
        if not next_page(browser):
            break
    browser.close()
    return citation
def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    This function crawls the publication list from a Google Scholar profile.
    @param[in] profile_url the link of the Google Scholar profile you want to crawl
    @param[in] verbose the level of information you want to scrape. By default we
               would scrape the detailed citation list for each publication.
    @return the publications as a dictionary keyed by title
    """
    # Scholar's article list.
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            print(title)
            author = publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation = int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link = publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            except:
                citation = 0
                link = None
            try:
                year = int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except:
                year = None
            """
            # To get the citation list for every paper, but this gets detected as a robot.
            if citation > 0 and verbose >= verbose_citation_list:
                print('and its citation list', end='')
                # time.sleep(2)  # tried to dodge the anti-crawl check, but it did not work
                cited_by = extract_citation_for_publication(link)
            else:
                cited_by = None
            print('finished')
            publication[title] = {'link': link, 'author': author, 'vendor': vendor,
                                  'citation': citation, 'cited by': cited_by, 'year': year}
            """
            # Fallback: store the citation count instead of the full citation list.
            publication[title] = {'link': link, 'author': author, 'vendor': vendor,
                                  'citation': citation, 'cited by': citation, 'year': year}
        if not next_page_new(browser):
            break
    browser.close()
    return publication
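# A minimal usage sketch for the Google Scholar crawlers above. The profile URL
# is a placeholder, and this assumes Browser is an alias for
# selenium.webdriver.Chrome (as the snippets imply) with chromedriver available.
if __name__ == '__main__':
    profile = 'https://scholar.google.com/citations?user=EXAMPLE_ID'  # hypothetical profile URL
    pubs = extract_publication(profile)
    for title, info in pubs.items():
        print(title, info['year'], info['citation'])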
def extract_movies(max_page_num=5):
    browser = Browser()
    browser.get(URL)
    movies = {}
    while True:
        movie_list = browser.find_elements_by_class_name('item')
        for movie in movie_list:
            title = movie.find_element_by_tag_name("p").text.strip()
            rating = movie.find_element_by_tag_name("strong").text.strip()
            movies[title] = rating
        if max_page_num > 0:
            max_page_num -= 1
            if not have_more(browser):
                break
        else:
            break
    browser.close()
    return movies
def __init__(self):
    opts = Options()
    # opts.add_argument("--user-data-dir=~/.config/google-chrome")
    opts.add_argument("start-maximized")
    opts.set_headless()
    browser = Chrome(chrome_options=opts)
    browser.get(self.URL)
    try:
        play_button = WebDriverWait(browser, 10).until(
            expected_conditions.element_to_be_clickable(
                (By.CLASS_NAME, self.PLAY_BUTTON_CLASS)))
        # Play button might not be in view; have to move to it to click it.
        ActionChains(browser).move_to_element(play_button).perform()
        play_button.click()
        self.__load_game_data(browser.page_source)
    finally:
        browser.close()
def scrape_company_data(companies):
    """Scrape data from Glassdoor on companies in list."""
    browser = Chrome()
    url = "https://www.glassdoor.com/Reviews/index.htm"
    browser.get(url)
    final_data = []
    sleep(23)
    for company in companies:
        try:
            sel = "input#KeywordSearch.keyword"
            search_bar = browser.find_element_by_css_selector(sel)
            search_bar.send_keys(company)
            search_bar.send_keys(Keys.ENTER)
            sleep(4)
            # If the search opened a second window, close the old one.
            if len(browser.window_handles) > 1:
                tab_1, tab_2 = browser.window_handles
                browser.switch_to.window(tab_1)
                browser.close()
                browser.switch_to.window(tab_2)
            sel = "a.tightAll.h2"
            company_link = browser.find_element_by_css_selector(sel)
            company_link.click()
            sleep(4)
            sel = "a.eiCell.cell.reviews"
            reviews_link = browser.find_element_by_css_selector(sel)
            reviews_link.click()
            sleep(4)
            temp_stats = [company]
            temp_stats.append(get_stats(browser))
            final_data.append(temp_stats)
            browser.get(url)
        except Exception as e:
            print(e)
            url = "https://www.glassdoor.com/Reviews/index.htm"
            browser.get(url)
    return pd.DataFrame(final_data)
def additionalCrawl3():
    # Additionally crawl the public-disclosure data for each province.
    target = 'http://www.csrc.gov.cn/pub/zjhpublic/'
    driver = Chrome(executable_path="/usr/local/bin/chromedriver", options=CHROME_OPS)
    driver.get(target)
    page_source = driver.page_source
    bs = BeautifulSoup(page_source, 'lxml')
    tag_a = bs.find('center').find_all('a')
    all_province_link = []
    for a in tag_a:
        href = 'http://www.csrc.gov.cn' + a.get('href')
        all_province_link.append(href)
    for link in all_province_link:
        driver.get(link)
        driver.switch_to.frame("DataList")
        page_source = driver.page_source
        bs = BeautifulSoup(page_source, 'lxml')
        all_row = bs.find_all('div', class_="row")
        for row in all_row:
            res_map = {}
            href = row.find('a').get('href')
            title = row.find('a').get_text()
            date = row.find('li', class_="fbrq").get('title')
            res_map["title"] = title
            res_map["date"] = format_date(date)
            res_map["url"] = 'http://www.csrc.gov.cn/pub/zjhpublic/' + href.replace('../', '')
            TARGET_LISTS.append(res_map)
    driver.close()
def Scrape_Harvard():
    # Set up MongoDB client.
    db_client = MongoClient(
        'mongodb+srv://jonesca7:[email protected]/test?retryWrites=true&w=majority'
    )
    db = db_client.CourseList  # Create database
    collection = db.collection  # Create collection
    # Set up Chrome driver for web browsing.
    webdriver = "chromedriver.exe"
    driver = Chrome(webdriver)
    url = "https://online-learning.harvard.edu/catalog/free"
    driver.get(url)
    pages = driver.find_elements_by_xpath(
        "//ul[@class='pager']/li[@class='pager-item']")
    num_pages = len(pages)
    course_list = []
    for page in range(num_pages + 1):
        url = "https://online-learning.harvard.edu/catalog/Free?page=" + str(page)
        driver.get(url)
        courses = driver.find_elements_by_xpath("//ul[@class='course-grid']/li")
        for course in courses:
            course_title = course.find_element_by_class_name("field-name-title-qs").text
            course_topic = course.find_element_by_class_name("field-name-subject-area").text
            course_url = course.find_element_by_xpath("div/div/div/h3/a").get_attribute("href")
            course_object = {
                "name": course_title,
                "topic": course_topic,
                "platform": "Harvard",
                "url": course_url,
            }
            course_list.append(course_object)
    collection.insert_many(course_list)
    driver.close()
def testDisplayEmailIn(system):
    driver = Chrome("chromedriver.exe")
    driver.get(index_url)
    driver.find_element_by_id('studentSelection').click()
    time.sleep(1)
    driver.find_element_by_id("email").send_keys(student_email)
    driver.find_element_by_id("password").send_keys(student_password)
    driver.find_element_by_id(system).click()
    driver.find_element_by_id("signInButton").click()
    time.sleep(4)
    alert = driver.switch_to.alert
    alert.accept()
    time.sleep(4)
    alert = driver.switch_to.alert
    assert re.match(r"You have \d unread emails?.", alert.text), \
        "wrong numUnread alert message"
    alert.accept()
    assert re.match(r"NUMBER OF UNREAD EMAILS: \d",
                    driver.find_element_by_class_name("numUnread").get_attribute("innerHTML")), \
        "wrong numUnread message"
    email = driver.find_element_by_id("email0")
    assert email is not None, "email0 not present"
    # Check email class names. We assume email0 is unread, email1 is
    # unreadUrgent, email2 is read, and email3 is readUrgent.
    # unread: email0
    assert driver.find_element_by_id("email0").get_attribute("class") == \
        "emailRow unreadRow", "wrong email className unread email"
    assert driver.find_element_by_id("emailTwoButtons0").get_attribute("class") == \
        "twoButtons unreadTwoButtons", "wrong className unread two buttons"
    # unreadUrgent: email1
    assert driver.find_element_by_id("email1").get_attribute("class") == \
        "emailRow unreadUrgentRow", "wrong email className unreadUrgent email"
    assert driver.find_element_by_id("emailTwoButtons1").get_attribute("class") == \
        "twoButtons unreadUrgentTwoButtons", "wrong className unreadUrgent two buttons"
    # read: email2
    assert driver.find_element_by_id("email2").get_attribute("class") == \
        "emailRow readRow", "wrong email className read email"
    assert driver.find_element_by_id("emailTwoButtons2").get_attribute("class") == \
        "twoButtons readTwoButtons", "wrong className read two buttons"
    # readUrgent: email3
    assert driver.find_element_by_id("email3").get_attribute("class") == \
        "emailRow readUrgentRow", "wrong email className readUrgent email"
    assert driver.find_element_by_id("emailTwoButtons3").get_attribute("class") == \
        "twoButtons readUrgentTwoButtons", "wrong className readUrgent two buttons"
    driver.close()
def iterate_through_results(driver: webdriver.Chrome) -> pd.DataFrame:
    """
    Go through a given page's senators.
    """
    col_names = [
        "tx_date",
        "file_date",
        "last_name",
        "first_name",
        "order_type",
        "ticker",
        "asset_name",
        "tx_amount",
    ]
    # Start with an empty frame that already carries the expected columns.
    all_txs = pd.DataFrame(columns=col_names)
    no_rows = 0
    n_links = 0
    for row in driver.find_elements_by_tag_name("tbody")[0].find_elements_by_tag_name("tr"):
        cols = row.find_elements_by_tag_name("td")
        first, last, report_type, date_received = (
            cols[0].get_attribute(INNER_TEXT),
            cols[1].get_attribute(INNER_TEXT),
            cols[3],
            cols[4].get_attribute(INNER_TEXT),
        )
        link = report_type.find_elements_by_tag_name("a")[0]
        click_on(driver, link)
        # The report opens in a new window; parse it, then return.
        driver.switch_to.window(driver.window_handles[-1])
        txs = parse_page(driver)
        if len(txs) == 0:
            no_rows += 1
        driver.close()
        driver.switch_to.window(driver.window_handles[-1])
        all_txs = all_txs.append(
            txs.assign(file_date=date_received, last_name=last, first_name=first)
        )
        time.sleep(2)
        n_links += 1
    LOGGER.info(
        "{} out of {} pages returned no extractable transaction data".format(no_rows, n_links)
    )
    return all_txs
def scrape_nba(url):
    # Derive the output file name from the "Season" parameter in the URL.
    tempstring = url.find("Season")
    tempstring = url[tempstring:tempstring + 14]
    tempstring = tempstring.replace('=', '')
    if url.find("team") > -1:
        tempstring = "C:/local/nba_stats/teams_" + tempstring.replace('-', '_') + ".csv"
    else:
        tempstring = "C:/local/nba_stats/" + tempstring.replace('-', '_') + ".csv"
    if os.path.exists(tempstring):
        return
    driver = Chrome(executable_path='C:/local/chromedriver_win32/chromedriver.exe')
    driver.get(url)
    driver.find_element_by_class_name('run-it').click()
    done = False
    time.sleep(10)
    more_results = driver.find_element_by_class_name('table-addrows__button')
    while not done:
        try:
            for x in range(32000):
                more_results.click()
                time.sleep(0.1)
            break
        except selenium.common.exceptions.StaleElementReferenceException:
            # The button goes stale once all rows are loaded.
            done = True
    tabletest = driver.find_element_by_class_name('nba-stat-table').text
    linecount = 0
    with open(tempstring, 'w') as csvfile:
        for line in tabletest.splitlines():
            tempstring = line.replace(' ', ',')
            tempstring = tempstring.replace('PLAYER', 'ID,FIRST,LAST')
            tempstring = tempstring.replace('MATCHUP', 'PLAYER TEAM,HOME,OPPONENT')
            if linecount != 0:
                tempstring = str(linecount) + "," + tempstring
            csvfile.writelines(tempstring + "\n")
            linecount += 1
    driver.close()
def recent_post_links(chrome_path, username, post_count=10):
    """
    With the input of an account page, scrape the most recent post urls.

    Args:
        chrome_path: path to the chromedriver executable
        username: Instagram username
        post_count: default of 10, set as many or as few as you want

    Returns:
        A list with the unique url links for the most recent posts for the
        provided user
    """
    print("User " + username + " started:")
    start_time = time.time()
    url = "https://www.instagram.com/" + username + "/"
    options = Options()
    options.add_argument("--headless")
    # options.add_argument('--no-sandbox')
    options.add_argument("--disable-gpu")
    options.add_argument("--remote-debugging-port=9222")
    browser = Chrome(options=options, executable_path=chrome_path)
    browser.get(url)
    post = "https://www.instagram.com/p/"
    post_links = []
    while len(post_links) < post_count:
        links = [a.get_attribute("href") for a in browser.find_elements_by_tag_name("a")]
        for link in links:
            if post in link and link not in post_links:
                post_links.append(link)
                print("\tPost " + str(len(post_links)) + " Processed")
        time_elaps = time.time() - start_time
        if time_elaps > (post_count / 12 * 20):
            print("Time out on reading in post details, some posts skipped")
            browser.close()
            return post_links[:post_count]
        scroll_down = "window.scrollTo(0, document.body.scrollHeight);"
        browser.execute_script(scroll_down)
        time.sleep(3)
    else:
        # browser.stop_client()
        browser.close()
        # os.system("taskkill /f /im chromedriver.exe /T")
        return post_links[:post_count]
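# A short usage sketch for recent_post_links; the chromedriver path and account
# name below are placeholders, not values from the source.
if __name__ == '__main__':
    links = recent_post_links('/usr/local/bin/chromedriver', 'instagram', post_count=5)
    for link in links:
        print(link)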
def get_links(start_url):
    mfa_links = get_json_from_file('mfa_links.json')
    driver = Chrome(executable_path="C://Users//User/chromedriver.exe")
    # Open page 1 and count pages.
    driver.get(start_url + '1')
    time.sleep(5)
    try:
        num_pages = int(
            driver.find_elements_by_css_selector('div.paginates > ul > li')[-2].text)
    except IndexError:
        # Pagination not rendered yet; reload and wait longer.
        driver.get(start_url + '1')
        time.sleep(10)
        num_pages = int(
            driver.find_elements_by_css_selector('div.paginates > ul > li')[-2].text)
    # Generate the list of page urls.
    pages = [start_url + str(i) for i in range(1, num_pages + 1)]
    # Get links to texts from every page.
    all_links = []
    n = 0
    for page in pages:
        print('Working with page', n, 'out of', num_pages)
        links = []
        while len(links) == 0:
            driver.get(page)
            time.sleep(3)
            links = [
                link.get_attribute('href')
                for link in driver.find_elements_by_css_selector('a.anons-title')
            ]
        print('Found', len(links), 'links on this page')
        all_links.extend(links)
        n += 1
    # Save scraped data to file.
    category = re.compile(r'/(\w+)\?').findall(start_url)[0]
    mfa_links[category] = all_links
    update_json(mfa_links, 'mfa_links.json')
    driver.close()
def run(self):
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless browser, no window pops up
    driver = Chrome(chrome_options=chrome_options)
    driver.get(
        'https://newids.seu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.seu.edu.cn%2Fqljfwapp2%2Fsys%2FlwReportEpidemicSeu%2Findex.do%3Ft_s%3D1594447890898%26amp_sec_version_%3D1%26gid_%3DSTZiVXZjRnhVSS9VNERWaFNNT1hXb2VNY3FHTHFVVHMwRC9jdTdhUlllcXVkZDNrKzNEV1ZxeHVwSEloRVQ4NHZFVzRDdHRTVlZ1dEIvczVvdzVpVGc9PQ%26EMAP_LANG%3Dzh%26THEME%3Dindigo%23%2FdailyReport'
    )
    driver.maximize_window()
    driver.find_element_by_id('username').send_keys(self.cfg.user_id)  # campus card number
    driver.find_element_by_id('password').send_keys(self.cfg.password)  # password
    driver.find_element_by_xpath(
        '//*[@class="auth_login_btn primary full_width"]').click()
    status = "打卡失败"  # "check-in failed"
    try:
        WebDriverWait(driver, 30, 0.2).until(
            lambda x: x.find_element_by_xpath('//*[@class="bh-btn bh-btn-primary"]'))
        driver.find_element_by_xpath('//*[@class="bh-btn bh-btn-primary"]').click()
        WebDriverWait(driver, 30, 0.2).until(
            lambda x: x.find_element_by_name('DZ_JSDTCJTW'))
        # Fill in the body-temperature field and save.
        driver.find_element_by_name('DZ_JSDTCJTW').send_keys('36.5')
        driver.find_element_by_id('save').click()
        WebDriverWait(driver, 30, 0.2).until(
            lambda x: x.find_element_by_xpath(
                '//*[@class="bh-dialog-btn bh-bg-primary bh-color-primary-5"]'))
        driver.find_element_by_xpath(
            '//*[@class="bh-dialog-btn bh-bg-primary bh-color-primary-5"]').click()
        status = "打卡成功"  # "check-in succeeded"
    except:
        pass
    self.send_email(status)
    driver.close()
class OpenBrowser:
    def __init__(self, flag):
        self.flag = flag

    def __enter__(self):
        if self.flag == 1:
            opt = FirefoxOptions()
            opt.set_headless()
            self.browser = Firefox(options=opt)
        else:
            opt = ChromeOptions()
            opt.headless = True
            self.browser = Chrome(options=opt)
        # Return the stored browser so __exit__ closes the same instance.
        return self.browser

    def __exit__(self, exc_type, exc_value, traceback):
        self.browser.close()
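# Usage sketch for the context manager above: flag 1 selects headless Firefox,
# anything else headless Chrome. The URL is a placeholder.
with OpenBrowser(0) as browser:
    browser.get('https://example.com')
    print(browser.title)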
def quick_win(parameter):
    driver = Chrome()
    driver.get("https://kirsorokin.github.io/tictactoe-angular-1.5/")
    table = driver.find_element_by_class_name("field")
    table_rows = table.find_elements_by_tag_name('tr')
    count_table_rows = len(table_rows)
    headers = table_rows[0]
    cells = headers.find_elements_by_tag_name('td')
    count_cells = len(cells)
    # Pick a random starting cell.
    first_cell = random.randint(1, count_cells)
    first_row = random.randint(1, count_table_rows)
    end_cell = first_cell
    end_row = first_row
    if parameter == "row":
        for item in range(1, 6):
            if first_row > count_table_rows:
                end_row -= 1
                table.find_element_by_xpath(
                    './/tbody/tr[%s]/td[%s]' % (end_row, first_cell)).click()
            else:
                table.find_element_by_xpath(
                    './/tbody/tr[%s]/td[%s]' % (first_row, first_cell)).click()
            first_row += 1
    elif parameter == "cell":
        for item in range(1, 6):
            if first_cell > count_cells:
                end_cell -= 1
                table.find_element_by_xpath(
                    './/tbody/tr[%s]/td[%s]' % (first_row, end_cell)).click()
            else:
                table.find_element_by_xpath(
                    './/tbody/tr[%s]/td[%s]' % (first_row, first_cell)).click()
            first_cell += 1
    else:
        print("no parameter")
        return 0
    try:
        driver.switch_to.alert.accept()
    except Exception as e:
        print(e)
        return 0
    table = driver.find_element_by_class_name("field")
    assert find_o(table.text) == 1
    driver.close()
    return 1
def get_baidu_hot():
    option = ChromeOptions()
    option.add_argument('--headless')  # hide the browser window
    option.add_argument('--no-sandbox')
    browser = Chrome(options=option, executable_path="chromedriver.exe")
    url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"
    browser.get(url)
    # Click the "load more" button.
    but = browser.find_element_by_css_selector(
        '#ptab-0 > div > div.VirusHot_1-5-6_32AY4F.VirusHot_1-5-6_2RnRvg > section > div')
    but.click()
    time.sleep(1)
    c = browser.find_elements_by_xpath(
        '//*[@id="ptab-0"]/div/div[1]/section/a/div/span[2]')
    print(len(c))
    context = [i.text for i in c]
    browser.close()
    return context
def getitem(name):
    ts = time()
    opts = Options()
    opts.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    opts.add_argument("--headless")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--no-sandbox")
    driver = Chrome(ChromeDriverManager().install(), options=opts)
    driver.get("https://www.bing.com/")
    WebDriverWait(driver, 25).until(
        EC.presence_of_element_located(
            (By.XPATH, r"/html/body/div[3]/div[2]/div[2]/form/input[1]")))
    search = driver.find_element_by_xpath(
        r"/html/body/div[3]/div[2]/div[2]/form/input[1]")
    search.send_keys(name + " meme")
    search.submit()
    WebDriverWait(driver, 25).until(
        EC.presence_of_element_located(
            (By.XPATH, r"/html/body/header/nav/ul/li[2]/a")))
    driver.find_element_by_xpath(r"/html/body/header/nav/ul/li[2]/a").click()
    sleep(2)
    # Scroll until the page height stops growing, so all images are loaded.
    h = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)
        nh = driver.execute_script("return document.body.scrollHeight")
        if nh == h:
            break
        h = nh
    WebDriverWait(driver, 25).until(
        EC.presence_of_element_located((
            By.XPATH,
            r"/html/body/div[3]/div[5]/div[3]/div[1]/ul[1]/li[1]/div/div/a/div/img")))
    content = driver.find_elements_by_class_name("mimg")
    image = choice(content).get_attribute("src")
    driver.close()
    return [image, str(time() - ts) + " s"]
def main():
    option = ChromeOptions()
    # option.add_argument("--headless")  # hide the browser window
    option.add_argument("--no-sandbox")  # disable the sandbox on Linux
    browser = Chrome(options=option)  # opens a Chrome window automatically
    x = 0  # count of downloaded images
    start = time.time()  # program start time
    url = "https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js"
    json_file = getJSONtext(url)
    for m in range(len(json_file['hero'])):
        heroId = json_file['hero'][m]['heroId']  # hero id
        name = json_file['hero'][m]['name']  # hero name
        hero_dir = create_folder(name)
        new_url = "https://lol.qq.com/data/info-defail.shtml?id=" + str(heroId)
        # print(new_url)
        browser.get(new_url)
        time.sleep(1)  # wait 1 second
        button = browser.find_element_by_xpath('//*[@id="skinNAV"]/li[2]/a/img')
        button.click()
        time.sleep(1)  # wait 1 second
        img = browser.find_elements_by_xpath('//*[@id="skinBG"]/li/img')
        name = browser.find_elements_by_xpath('//*[@id="skinBG"]/li')
        for i in range(len(name)):
            # print(img[i].get_attribute("src"))
            # print(name[i].get_attribute("title"))
            try:
                # Fetch the raw bytes of the skin image and save it.
                picture = requests.get(img[i].get_attribute("src")).content
                with open(hero_dir + str(name[i].get_attribute("title")) + '.jpg',
                          'wb') as f:
                    f.write(picture)
                x = x + 1
                print("Downloading... image #" + str(x))
            except:
                pass
        time.sleep(2)  # wait 2 seconds
    browser.close()
    end = time.time()  # program end time
    time_second = end - start  # elapsed time
    print("Downloaded " + str(x) + " images in " + str(time_second) + " seconds")
def browser(config_browser, config_wait_time):
    # Initialize WebDriver.
    if config_browser == 'chrome':
        driver = Chrome()
    elif config_browser == 'firefox':
        driver = Firefox()
    else:
        raise Exception(f'"{config_browser}" is not a supported browser')
    # Wait implicitly for elements to be ready before attempting interactions.
    driver.implicitly_wait(config_wait_time)
    driver.maximize_window()
    # Return the driver object at the end of setup.
    yield driver
    # For cleanup, quit the driver.
    driver.close()
    driver.quit()
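# A minimal test sketch that consumes the fixture above, assuming it is
# registered with @pytest.fixture and that config_browser/config_wait_time
# fixtures exist in conftest.py; the URL and title check are placeholders.
def test_homepage_loads(browser):
    browser.get('https://example.com')
    assert 'Example' in browser.title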
def pxi_Rtm_import():
    # Set the download directory path.
    p = {
        'download.default_directory':
        r'C:\Users\dheer\Desktop\wrldc\RTM_BROWSER_AUTOMATION\Dumps\pxiRtmFile'
    }
    # Add options to the browser (`opts` is assumed to be a module-level
    # ChromeOptions instance).
    opts.add_experimental_option('prefs', p)
    browser = Chrome(options=opts)
    # Maximize the browser.
    browser.maximize_window()
    # Open the market-volume report page.
    browser.get(
        'https://www.powerexindia.com/code/frontend/Reports/RTM/MarketVolumeProfileReport.html/'
    )
    # Fill both date pickers with yesterday's date.
    previousDate = dt.datetime.today() - dt.timedelta(days=1)
    previousDateFormatted = previousDate.strftime('%d-%m-%Y')  # format as dd-mm-yyyy
    browser.find_elements_by_name("DeliveryfromDate")[0].send_keys(previousDateFormatted)
    browser.find_elements_by_name("DeliverytoDate")[0].send_keys(previousDateFormatted)
    button = browser.find_elements_by_name('submit')
    button[0].click()
    # Click the CSV download option (button class = "dt-button buttons-csv buttons-html5").
    # csvDwnLd = browser.find_element_by_css_selector("dt-button buttons-csv buttons-html5")
    csvDwnLd = browser.find_elements_by_tag_name("span")
    srcFileLocation = r'C:\Users\dheer\Desktop\wrldc\RTM_BROWSER_AUTOMATION\Dumps\pxiRtmFile'
    destFileLocation = r'C:\Users\dheer\Desktop\wrldc\RTM_BROWSER_AUTOMATION\Dumps\pxiRtmFile\Archives'
    destFileName = "DASMVPReport_"
    moveFilesToArchive(srcFileLocation, destFileLocation, destFileName)
    csvDwnLd[11].click()
    print("pxi rtm fetch successful")
    time.sleep(10)
    browser.close()
def map_zc_to_rep(start_end_tuple):
    """Map Zip Codes to U.S. Representatives."""
    print(f"Process {os.getpid()}: ***map_zc_to_rep***")
    start = start_end_tuple[0]
    end = start_end_tuple[1]
    shared_state_zipcode_data_map = start_end_tuple[2]
    sns = STATE_NAMES[start:end]
    browser = Chrome(executable_path=CHROME_DRIVER_PATH)
    print(f"Process {os.getpid()}: States to evaluate: {sns}")
    for state in sns:
        print(f"Process {os.getpid()}: State: {state}")
        if len(shared_state_zipcode_data_map[state]) != 0:
            for index, zip_code_city_pair in enumerate(shared_state_zipcode_data_map[state]):
                zip_code = zip_code_city_pair[0]
                browser.get(REP_URL)
                sleep(2.5)
                find_rep_input_field = browser.find_elements_by_css_selector(
                    '#Find_Rep_by_Zipcode')
                find_rep_input_field[0].send_keys(zip_code)
                find_rep_button = browser.find_elements_by_css_selector('.btn-success')
                find_rep_button[0].click()
                sleep(2.5)
                rep_page_anchor_tags = browser.find_elements_by_css_selector('.rep > a')
                reps = ""
                for anchor_tag in rep_page_anchor_tags:
                    if anchor_tag.text == '':
                        continue
                    print(f"Process {os.getpid()} Representative: {anchor_tag.text}")
                    reps += anchor_tag.text + ", "
                    # Remove when not debugging
                    # break
                zip_code_city_pair.append(reps)
                shared_state_zipcode_data_map[state][index] = zip_code_city_pair
                # Remove when not debugging
                # break
    print("DONE")
    browser.close()
def main():
    webdriver = os.path.join(r"drive", "chromedriver")
    driver = Chrome(webdriver)
    url = "https://www.waytostay.com/paris-apartments/"
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    info = driver.find_elements_by_class_name("tile")
    prices = []
    details = []
    for j in range(len(info)):
        prices.append(driver.find_elements_by_class_name('price-person')[j].text)
        details.append(driver.find_elements_by_class_name('concise-details')[j].text)
    print(prices, details)
    driver.close()
    driver.quit()
class TestNavigation(StaticLiveServerTestCase):
    """Test navigation from the homepage."""

    def setUp(self):
        self.browser = Chrome()
        self.browser.implicitly_wait(10)

    def tearDown(self):
        self.browser.close()

    def test_bad_address_returns_handler404(self):
        """
        Test that a bad address is caught by the handler and redirects to the
        error page.
        """
        print(inspect.currentframe().f_code.co_name)
        response = self.browser.get('%s%s' % (self.live_server_url, '/test'))
        # NOTE: Selenium's get() returns None; assertTemplateUsed expects a
        # Django test-client response, so this assertion will not work as-is.
        # message = self.browser.find_element_by_tag_name('h1').text
        self.assertTemplateUsed(response, 'errors/errors.html')

    def test_click_mentions(self):
        """Test that clicking on mentions redirects to the mentions page."""
        print(inspect.currentframe().f_code.co_name)
        self.browser.get(self.live_server_url)
        user_url = self.live_server_url + reverse('home_app:mentions')
        element = self.browser.find_element_by_partial_link_text('mentions')
        actions = ActionChains(self.browser)
        actions.move_to_element(element)
        actions.click(element)
        actions.perform()
        self.assertEqual(self.browser.current_url, user_url)

    def test_click_icon_person_to_user(self):
        """Test that clicking on the person image redirects to the user page."""
        print(inspect.currentframe().f_code.co_name)
        self.browser.get(self.live_server_url)
        user_url = self.live_server_url + reverse('user_app:user')
        self.browser.find_element(By.CSS_SELECTOR, ".nav-item img").click()
        self.assertEqual(self.browser.current_url, user_url)
def get_md_event(conn, cur):
    company = "mcdonalds"
    driver = Chrome()
    driver.get(URL_2)
    xpath_front = '//*[@id="promotionList"]/li['
    xpath_rear = ']'
    for i in range(1, 6):
        driver.implicitly_wait(100)
        xpath = xpath_front + str(i) + xpath_rear
        event = driver.find_element_by_xpath(xpath)
        image_loc = driver.find_element_by_xpath(
            '//*[@id="promotionList"]/li[' + str(i) + ']/a/div[1]/img')
        image = image_loc.get_attribute('src')
        event.click()
        time.sleep(2)
        title_loc = driver.find_element_by_xpath(
            '//*[@id="container"]/div[1]/div[1]/div[2]/div/div/div[1]/h2')
        title = title_loc.text.replace('\n', '')
        date_loc = driver.find_element_by_xpath(
            '//*[@id="container"]/div[1]/div[1]/div[2]/div/div/div[1]/span/em[1]')
        date = date_loc.text.replace('\n', '')
        date = date.replace('등록일 :', '')  # strip the "registration date:" label
        content_loc = driver.find_element_by_xpath(
            '//*[@id="container"]/div[1]/div[1]/div[2]/div/div/article/div[1]/img')
        content = content_loc.get_attribute('src')
        if insert_event_list(conn, cur, company, date, image, title, content):
            message = title + ' inserted in db' + '\n'
            print(message)
        driver.back()
    driver.close()
def update_page_source(self):
    url = self.build_search_url()
    driver = Chrome()
    driver.get(url)
    num_scrolls = 0
    try:
        while num_scrolls < self.scroll_max:
            driver.execute_script(random_js_scroll())
            self.page_source = driver.page_source
            random_sleep()
            num_scrolls += 1
    except Exception as e:
        l.WARN(e)
    driver.close()
def parse_page(url: str, driver: webdriver.Chrome) -> None:
    print('Parsing ', url)
    try:
        driver.get(url)
    except TimeoutException:
        # Reload the browser in case of an unreachable page, then move on to
        # the next page number (the url is assumed to end in a 3-digit number).
        driver.close()
        driver = create_driver()
        page_number = int(url[-3::])
        next_url = url.replace(str(page_number), str(page_number + 1))
        parse_page(next_url, driver)
    comment_id_elements = driver.find_elements_by_xpath(
        "//*[contains(@id,'Comment_')]")
    comment_ids = [id.get_attribute('id') for id in comment_id_elements]
    for comment_id in comment_ids:
        user_id_element = driver.find_element_by_xpath(
            f'//*[@id="{comment_id}"]/div/div[2]/div[1]/span[1]/a[2]')
        time_element = driver.find_element_by_xpath(
            f'//*[@id="{comment_id}"]/div/div[2]/div[2]/span[1]/a/time')
        user_message = driver.find_element_by_xpath(
            f'//*[@id="{comment_id}"]/div/div[3]/div/div[1]')
        user_id = user_id_element.text
        time = time_element.get_attribute('title')
        comment = user_message.text
        mongo.insert(user_id, time, comment)
    try:
        next_page_element = driver.find_element_by_xpath(
            '//*[@id="PagerBefore"]/a[contains(@class, "Next Pager-nav")]')
        next_page_url = next_page_element.get_attribute('href')
        parse_page(next_page_url, driver)
    except NoSuchElementException:
        # We have reached the last page.
        driver.close()
        sys.exit(0)
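# Entry-point sketch for the recursive parser above, assuming create_driver()
# returns a configured webdriver.Chrome; the forum URL (ending in a 3-digit
# page number, as parse_page expects) is a placeholder.
if __name__ == '__main__':
    driver = create_driver()
    parse_page('https://forum.example.com/discussion/123/p100', driver)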
def download_song(search_song, format='mp3_l', download=False):
    search_url = ('https://y.qq.com/portal/search.html#page=1&searchid=1'
                  '&remoteplace=txt.yqq.top&t=song&w={}').format(search_song)
    search_download_url_url = 'http://www.douqq.com/qqmusic/qqapi.php'
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = Chrome('files/server/driver/chromedriver.exe', options=chrome_options)
    driver.get(search_url)
    driver.implicitly_wait(5)
    try:
        element = driver.find_element_by_xpath(
            '//*[@id="song_box"]/div[2]/ul[2]/li[1]/div/div[2]/span/a')
        input_url = element.get_attribute('href')
    except:
        return False, '歌曲名未查询到!'  # "song name not found!"
    finally:
        driver.close()
    try:
        info_dict = loads(
            loads(post(search_download_url_url,
                       data={'mid': input_url}).text).replace('\\/', '/'))
    except:
        return False, '破解大法失败,大侠还是购买正版吧!'  # "cracking failed; better buy the real thing!"
    if 'mp3' in format:
        file_name = '.'.join([info_dict['songname'], 'mp3'])
    else:
        file_name = '.'.join([info_dict['songname'], format])
    download_url = info_dict[format]
    print('download_url', download_url)
    if download:
        FILE_PATH = 'files/music/'
        if not exists(FILE_PATH):
            mkdir(FILE_PATH)
        try:
            urlretrieve(download_url, FILE_PATH + file_name)
        except:
            return False, '似乎无权作弊,你可以试试选择别的格式哟!'  # "no permission; try another format!"
        return True, FILE_PATH + file_name
    else:
        return True, download_url
def askUserDoesHeWant(offer_url, is_authenticated):
    print("If you want it, press Y, else N. For offer description I. For link L. To open it in your browser press O")
    user_key = readchar.readkey()
    if user_key == "y" or user_key == 'Y':
        if is_authenticated == 'Not logged in':
            print("You cannot send messages when you aren't logged in")
            print("Press B to come back to main tab and log in")
            user_key = readchar.readkey()
            if user_key == "B" or user_key == "b":
                mainTab(is_authenticated)
                askUserDoesHeWant(offer_url, is_authenticated)
        else:
            print("Input: Yes")
            offer_database = open("offerDatabase.txt", "a")
            offer_database.write(offer_url)
            offer_database.close()
            sendMessage(offer_url)
    elif user_key == 'n' or user_key == 'N':
        print("Input: No")
        offer_database = open("offerDatabase.txt", "a")
        offer_database.write(offer_url)
        offer_database.close()
    elif user_key == 'i' or user_key == 'I':
        print("------------------------------")
        print(additionalOfferInfo(offer_url))
        askUserDoesHeWant(offer_url, is_authenticated)
    elif user_key == 'l' or user_key == 'L':
        print(offer_url)
        askUserDoesHeWant(offer_url, is_authenticated)
    elif user_key == "o" or user_key == "O":
        additional_browser = Chrome()
        additional_browser.get(offer_url)
        print("Press enter when you're done")
        input()
        additional_browser.close()
        askUserDoesHeWant(offer_url, is_authenticated)
    elif user_key == readchar.key.CTRL_C:
        print("Bye")
        mainBrowser.quit()
        exit()
    else:
        askUserDoesHeWant(offer_url, is_authenticated)
class InstaBot:
    BASE_URL = "https://www.instagram.com/"

    def __init__(self):
        self.driver = Chrome()
        self.wait = WebDriverWait(self.driver, 15)
        self.driver.maximize_window()
        self.driver.implicitly_wait(5)
        self.driver.get(self.BASE_URL)

    def login(self, username, password):
        USERNAME_FIELD = "//input[@type='text']"
        PASSWORD_FIELD = "//input[@type='password']"
        self.driver.find_element_by_xpath(USERNAME_FIELD).send_keys(username)
        self.driver.find_element_by_xpath(PASSWORD_FIELD).send_keys(password)
        self.driver.find_element_by_xpath(PASSWORD_FIELD).send_keys(Keys.ENTER)
        self.wait.until(EC.url_contains("accounts/"))

    def follow(self, target, quantity=10):
        target = target.strip().lower()
        account_url = self.BASE_URL + f"{target}" + '/'
        self.driver.get(account_url)
        self.wait.until(EC.title_contains(target))
        FOLLOWING_LINK = f"//a[@href='/{target}/following/']"
        self.driver.find_element_by_xpath(FOLLOWING_LINK).click()
        FOLLOW_BTNS = "//button[text()='Follow']"
        follow_btns = self.driver.find_elements_by_xpath(FOLLOW_BTNS)
        if len(follow_btns) == 0:
            # Scroll the "following" popup so the buttons load.
            popup = self.driver.find_element_by_class_name('isgrP')
            self.driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight', popup)
            follow_btns = self.driver.find_elements_by_xpath(FOLLOW_BTNS)
        for btn in follow_btns[:quantity + 1]:
            btn.click()
            sleep(1)

    def stop(self):
        self.driver.close()
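# Usage sketch for the bot above; the credentials and target account are
# placeholders, and Instagram's markup changes often, so treat the selectors
# in the class as fragile.
bot = InstaBot()
bot.login('your_username', 'your_password')
bot.follow('instagram', quantity=5)
bot.stop()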
def utm():
    driver = Chrome()
    driver.implicitly_wait(5)
    driver.set_page_load_timeout(5)
    driver.get(
        "https://39.134.87.216:31943/pm/themes/default/pm/app/i2000_monitorView_pm.html?curMenuId=com.iemp.app.pm.monitorView&_=1545967221368#group_152734715982719"
    )
    # print(driver.page_source)
    usr = driver.find_element_by_xpath('//*[@id="username"]')
    usr.send_keys("admin")
    pw = driver.find_element_by_xpath('//*[@id="password"]')
    pw.send_keys("HuaWei12#$")
    input('Press Enter to continue...')
    # (Commented-out experiments removed: typing the captcha from user input,
    # sending ESC via ActionChains, and clicking the '#treeDiv_1_switch'
    # tree expander.)
    # Grab the session cookie so later requests can reuse the login.
    cookie = ''
    for item in driver.get_cookies():
        if item['name'] == 'JSESSIONID':
            cookie = 'JSESSIONID=' + item['value']
    driver.close()
    driver.quit()
    return cookie
def get_search_result(url, data):
    opt = ChromeOptions()
    opt.headless = True
    browser = Chrome(options=opt)  # pick the browser; nothing but Chrome was any use here
    browser.get(url)
    time1 = random.uniform(1, 2)
    time.sleep(time1)  # simulated pause, in seconds
    browser.find_element_by_id('kw').send_keys(data)  # simulate typing
    # time.sleep(random.uniform(1, 2))  # a pause does not seem necessary here
    browser.find_element_by_id('su').click()  # simulate the click
    time2 = random.uniform(2, 5)
    time.sleep(time2)
    # html = browser.find_element_by_xpath("html").text  # oddly, this grabs the text directly
    html = browser.execute_script(
        "return document.documentElement.outerHTML")  # grabs the rendered HTML source
    time3 = random.uniform(1, 3)
    time.sleep(time3)
    if len(data) >= 10:
        view_len = 10
    else:
        view_len = len(data)
    print('Query: ' + data[:view_len] + '\n'
          'Simulated load time: ' + str(round(time1, 2)) + 's\n'
          'Simulated search time: ' + str(round(time2, 2)) + 's\n'
          'Simulated viewing time: ' + str(round(time3, 2)) + 's\n')
    this_url = browser.current_url
    browser.close()
    # Baidu result blocks start after "- </span>" and end at '</div><div class='.
    reg = r'<div class="c-abstract">(.*?)</div><div class="f13">'
    result_first = [i for i in re.findall(reg, html) if i != '']
    result_final = []
    for i in result_first:
        reg = r'<span(.*?)</span>'
        try:
            del_text = re.findall(reg, i)[0]
            text_new = i.replace('<span', '').replace('</span>', '').replace(del_text, '')
        except:
            text_new = i.replace('<span', '').replace('</span>', '')
        result_final.append(text_new)
    result = result_final + [this_url] + [data]
    return result
class TestSample(unittest.TestCase):
    def setUp(self):
        self.driver = Chrome("C://chromedriver.exe")
        self.driver.maximize_window()
        self.driver.implicitly_wait(40)
        self.driver.get("https://demo.actitime.com/")
        self.login = LoginPage(self.driver)
        self.home = HomePage(self.driver)
        self.user = UserPage(self.driver)

    def tearDown(self):
        self.driver.close()

    def test_invalid_login_TC13121(self):
        Data = json.load(open("./test/regression/login/UserStory123.json"))
        self.login.wait_for_login_page_to_load()
        self.login.get_username_textbox().send_keys(Data['TC12345']['Username1'])
        self.login.get_password_textbox().send_keys(Data['TC12345']['Password1'])
        self.login.get_login_button().click()
        actual_error_msg = self.login.get_login_error_msg().text
        expected_error_msg = "Username or Password is invalid. Please try again."
        assert actual_error_msg == expected_error_msg, "error message did not match"

    def test_Add_User(self):
        Data = json.load(open("./test/regression/login/UserStory123.json"))
        self.login.wait_for_login_page_to_load()
        self.login.get_username_textbox().send_keys(Data['TC12345']['Username'])
        self.login.get_password_textbox().send_keys(Data['TC12345']['Password'])
        self.login.get_login_button().click()
        self.home.get_users_button().click()
        self.user.get_add_user_button().click()
        self.user.wait_for_add_user_to_load()
        self.user.get_first_name_textbox().send_keys("Kushal")
        self.user.get_last_name_textbox().send_keys("R")
        self.user.get_email_textbox().send_keys("*****@*****.**")
        self.user.get_dropdown_list().click()
        self.user.get_department_dropdown().click()
        self.user.get_save_send_invitation_button().click()
def extract_hongren(max_page_num=5):
    suffix = "hongren"
    # Normally, adding the chromedriver directory to the Path environment
    # variable is enough, but for some reason that did not work here, so the
    # driver simply sits in the same directory as the code.
    browser = Browser('chromedriver.exe')
    browser.get(BASE_URL + suffix)
    items = {}
    while True:
        item_list = browser.find_elements_by_class_name('wall_item')
        for item in item_list:
            href = item.find_element(By.CSS_SELECTOR, ".pic_box.pic").get_attribute("href")
            desc = item.find_elements_by_class_name("desc")[0].text.strip()
            items[href] = desc
        if max_page_num > 0:
            max_page_num -= 1
            if not scroll_to_next(browser):
                break
        else:
            break
    browser.close()
    return items
class AutomatorMixin(object):
    class UnexpectedSituation(Exception):
        pass

    data_property_class = None

    def __init__(self, steps, data_args=()):
        self.steps = steps
        self.data = self.data_property_class(*data_args)

    def run(self):
        options = ChromeOptions()
        options.add_argument('--test-type')
        self.driver = Chrome(chrome_options=options)
        self.perform_steps()
        self.driver.close()

    def find_element(self, selector):
        LOG.info('finding selector "%s"' % selector)
        return self.driver.find_element_by_css_selector(selector)

    @property
    def action_method_lookup(self):
        return self.get_action_method_lookup()

    def get_action_method_lookup(self):
        return {
            'click': self.perform_click,
            'fill_form': self.perform_fill_form,
            'select_drop_down': self.perform_select_drop_down,
        }

    def get_css_selector(self, action):
        return action.get('css_selector')

    def get_action_value(self, action):
        if 'value' in action:
            value = action['value']
        elif 'property' in action:
            property_name = action['property']
            value = getattr(self.data, property_name)
        else:
            raise AutomatorMixin.UnexpectedSituation('Cannot find key "property" or "value"')
        return value

    def perform_steps(self):
        for step in self.steps:
            if 'url' in step:
                self.driver.get(step['url'])
            if 'actions' in step:
                self.perform_actions(step['actions'])

    def perform_actions(self, actions):
        for action in actions:
            action_method = self.action_method_lookup[action['type']]
            action_method(action)

    def perform_click(self, action):
        selector = self.get_css_selector(action)
        if selector:
            self.find_element(selector).click()
            return
        # Find by id. Needed when people use "." in their id names, such as KFC's survey.
        css_id = action['id_selector']
        LOG.info(css_id)
        self.driver.find_element_by_id(css_id).click()

    def perform_fill_form(self, action):
        selector = self.get_css_selector(action)
        value = self.get_action_value(action)
        self.find_element(selector).send_keys(value)

    def perform_select_drop_down(self, action):
        selector = self.get_css_selector(action)
        value = self.get_action_value(action)
        Select(self.find_element(selector)).select_by_value(value)
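# A hypothetical steps structure the mixin above would accept: each step has an
# optional 'url' plus a list of actions dispatched through action_method_lookup.
# All selectors and values here are illustrative only.
EXAMPLE_STEPS = [
    {
        'url': 'https://example.com/form',
        'actions': [
            {'type': 'fill_form', 'css_selector': '#name', 'value': 'Jane Doe'},
            {'type': 'select_drop_down', 'css_selector': '#country', 'value': 'US'},
            {'type': 'click', 'css_selector': 'button[type=submit]'},
        ],
    },
]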
def start_callback():
    """
    Main loop of the scrape.
    """
    # The Instagram username of the profile from which we are downloading.
    # Must be supplied.
    profile_username = E_username.get()
    # Will be initialized with the optional argument or a default later.
    output_directory = E_path.get()
    update_mode = True
    serialize = True
    latest_image = ''
    # The latest downloaded images will be the first in the directory.
    files = os.listdir(output_directory)
    if files:
        latest_image = files[0]
    # Start the browser.
    driver = Chrome(executable_path='../bin/chromedriver')
    driver.get(insta_url + profile_username)
    # Find the number of posts on this Instagram profile.
    post_count_tag_xpath = ('//*[@id="react-root"]/section/main/' +
                            'article/header/div[2]/ul/li[1]/span/span')
    post_count_tag = driver.find_element_by_xpath(post_count_tag_xpath)
    post_count = int(post_count_tag.text.replace(',', ''))
    # If the target profile is private, then redirect to the login page.
    login_tag_xpath = '//*[@id="react-root"]/section/main/article/div/p/a'
    try:
        login_tag = driver.find_element_by_xpath(login_tag_xpath)
        login_page_url = login_tag.get_attribute('href')
        driver.get(login_page_url)
        # Wait for the user to log in.
        while driver.current_url == login_page_url:
            sleep(1)
        # Return to the target profile from the homepage.
        driver.get(insta_url + profile_username)
    except:
        pass
    # Click the 'Load More' element.
    driver.find_element_by_class_name('_oidfu').click()
    # Load all the posts into the browser.
    processed = 0
    while processed < post_count:
        # Load more content by scrolling to the bottom of the page.
        driver.execute_script("window.scrollTo(0, 0);")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Download 4 rows of items (4 rows are loaded upon each scroll) and
        # remove them from view.
        for _ in itertools.repeat(None, 4):
            urls = fetch_row_links(driver)
            delete_row(driver)
            for url in urls:
                # Exit if we've reached the latest image that was in the
                # directory before downloading. This means the directory has
                # everything beyond this point.
                if update_mode:
                    fname = file_name.search(url).group(0)
                    if fname in latest_image:
                        exit(0)
                download_from_url(url, output_directory, serialize,
                                  post_count - processed)
                processed += 1
    driver.close()
# Excerpt from a larger script: the `if` branch that precedes this fragment
# (saved cookies were found and loaded) is not shown.
    name_text_box = browser.find_element_by_class_name("paddingUnifier")
    name_text_box.send_keys(venmoInfo.payee_name)
    name_text_box.send_keys(Keys.ENTER)
    payment_box = browser.find_element_by_class_name("mainTextBox")
    time.sleep(1)
    payment_box.click()
    datetime_now = datetime.datetime.now()
    SendKeys.SendKeys(venmoInfo.amount + venmoInfo.description, with_spaces=True)
    # Click the pay button.
    pay_button = browser.find_element_by_id("onebox_pay_toggle")
    pay_button.click()
    name_text_box = browser.find_element_by_class_name("paddingUnifier")
    name_text_box.send_keys(venmoInfo.payee_name)
    # Click the send button.
    send_button = browser.find_element_by_id("onebox_send_button")
    send_button.click()
else:
    # Click on the sign-in link.
    signin_link = browser.find_element_by_link_text("Sign in")
    signin_link.click()
    print("Couldn't find the cookie file, you will need two-factor authorization and then the cookie will be saved")
    # Wait a while until the user fully signs in.
    time.sleep(60)
    # Save the cookies.
    pickle.dump(browser.get_cookies(), open("cookies.pkl", "wb"))
time.sleep(10)
browser.close()
# Click on ship to this address.
# driver.find_element_by_xpath("//*[@id='button_ship_to']").click()

# Mouse-hover to Place Order.
Place_Order = Browser.find_element_by_xpath("//*[@id='placeOrderBtn']")
hover = ActionChains(Browser).move_to_element(Place_Order)
hover.perform()
# Place the order.
Browser.find_element_by_xpath("//*[@id='placeOrderBtn']").click()
time.sleep(5)

# Mouse-hover to Place Order again.
Place_Order = Browser.find_element_by_xpath("//*[@id='placeOrderBtn']")
hover = ActionChains(Browser).move_to_element(Place_Order)
hover.perform()
# Place the order.
Browser.find_element_by_xpath("//*[@id='placeOrderBtn']").click()
time.sleep(13)

# Log out.
Browser.find_element_by_xpath("//*[@id='userAccount']/a").click()
Browser.find_element_by_xpath("//*[@id='userAccount']/ul/li[10]/a").click()

# Close the browser.
Browser.close()