class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_2_add_post(self):
        self.browser.visit("http://127.0.0.1:8080")
        print("current url = ", self.browser.url)
        self.browser.driver.set_window_size(1920, 1080)
        self.browser.click_link_by_text('login')
        print("current url = ", self.browser.url)
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)
        add_link = self.browser.find_link_by_partial_text('add')
        add_link.click()
        print(self.browser.url)
        title = "test_acceptance_add_post"
        self.browser.fill("title", title)
        now = datetime.datetime.now()
        now = str(now)
        self.browser.fill("content", now)
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)
        new_post_appears = self.browser.is_text_present(
            title) and self.browser.is_text_present(now)
        print("new_post_appears = ", new_post_appears)
        self.assertEqual(new_post_appears, True)
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_add_entry(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.browser.click_link_by_text("Add Entry")
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "Test Title")
        self.browser.fill("content", "Test Content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
class BaseWebTestCase(LiveServerTestCase):
    """ Abstract class to handle logic for web tests """
    username = '******'
    password = '******'
    wait_seconds = 3.0

    def setUp(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--no-sandbox")
        self.browser = Browser('chrome', headless=True, wait_time=10,
                               options=chrome_options)
        super(BaseWebTestCase, self).setUp()

    def tearDown(self):
        self.browser.quit()
        try:
            super(BaseWebTestCase, self).tearDown()
        except IndexError:
            print("Ignoring IndexError in tearDown...")

    def _login(self):
        self._visit("")
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.password)
        self.browser.find_by_text('Sign in').first.click()
        assert self.browser.is_text_present('Home')
        assert not self.browser.is_text_present('Sign in')

    def _go_home(self):
        self.browser.click_link_by_text('Home')
        time.sleep(self.wait_seconds)

    def _setup_confirm(self):
        """
        First part of work-around to let phantomjs accept confirmation dialogs
        http://stackoverflow.com/questions/19903146/confirm-alert-window-in-phantom-js
        """
        js_confirm = 'window.confirm = function() { return true }'
        self.browser.execute_script(js_confirm)

    def _accept_confirm(self):
        """
        Second part of work-around to let phantomjs accept confirmation dialogs
        MUST call self._setup_confirm() for this to work
        """
        self.browser.execute_script('return window.confirm')

    def _visit(self, path):
        path = self.live_server_url + path
        self.browser.visit(path)
        time.sleep(self.wait_seconds)
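# BaseWebTestCase above is abstract; concrete tests only call its helpers.
# A minimal sketch of such a subclass, assuming a hypothetical "/articles/"
# route and "My Articles" link text (substitute your application's own):
class ArticleListWebTestCase(BaseWebTestCase):
    def test_logged_in_user_sees_articles(self):
        self._login()
        self._visit("/articles/")  # hypothetical route, not from the source
        assert self.browser.is_text_present("My Articles")  # hypothetical text

    def test_home_link_returns_to_dashboard(self):
        self._login()
        self._go_home()
        assert self.browser.is_text_present('Home')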
class TestViews(unittest.TestCase):
    def setUp(self):
        # Setup client
        self.browser = Browser('phantomjs')
        self.browser.driver.set_window_size(1024, 768)
        # Setup DB
        db.create_all()
        # Create User
        self.user = User(name='Alice', email='*****@*****.**',
                         password=generate_password_hash('test'))
        db.session.add(self.user)
        db.session.commit()
        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        self.process.terminate()
        db.session.close()
        db.drop_all()
        self.browser.quit()

    def test_login_correct(self):
        self.browser.visit('http://127.0.0.1:5000/login')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password', 'test')
        self.browser.find_by_css('button[type=submit]').click()
        self.assertEqual(self.browser.url, 'http://127.0.0.1:5000/')

    def test_authenticated_add_entry(self):
        # do the login
        self.browser.visit('http://127.0.0.1:5000/login')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password', 'test')
        self.browser.find_by_css('button[type=submit]').click()
        # navigate to the entry add form
        self.browser.click_link_by_text('Add Entry')
        # create a new entry
        self.browser.fill('title', 'The Title')
        self.browser.fill('content', 'The Content')
        self.browser.find_by_css('button[type=submit]').click()
        # check for entry title in home
        self.assertIn('The Title',
                      [e.text for e in self.browser.find_by_css('.row h1')])
def get_job_information(self, keyword, writer):
    count = 0
    browser = Browser('phantomjs', service_args=self.proxy)
    soup = self.__visit_url(browser, keyword)
    jobs = soup.find_all('tr', attrs={'class': 'aJobS'})
    count += self.__parse_data(jobs, keyword, writer)
    # keep paging while a "Next" link is present
    while soup.find('li', attrs={'class': 'next'}):
        browser.click_link_by_text('Next')
        data = browser.html
        soup = BeautifulSoup(data)
        jobs = soup.find_all('tr', attrs={'class': 'aJobS'})
        count += self.__parse_data(jobs, keyword, writer)
    print(count)
    print(keyword)
    browser.quit()
def scrape(username, password):
    appartments = []
    browser = Browser('chrome', headless=True)
    login(browser, username, password)
    browser.click_link_by_text('Lgh')
    links = browser.find_link_by_partial_href(
        'https://nya.boplats.se/objekt/1hand/')
    for l in links:
        appartments.append(extract_table_info(browser, l))
    for a in appartments:
        add_details(browser, a)
    return appartments
def login(self, name, passwd):
    browser = Browser(driver_name="chrome")
    url = 'https://www.jd.com/'
    browser.visit(url)
    # pitfall one: the link text must match the page exactly
    browser.click_link_by_text("你好,请登录")
    browser.click_link_by_text("账户登录")
    # fill in the account name and password
    browser.fill("loginname", name)
    browser.fill("nloginpwd", passwd)
    try:
        self.JdVerfy.get_jd_verfy_code()
        content = self.ocr.get_image_verfy_code(contant.img_path)
        print('jd verfy code is %s' % content)
    except Exception:
        raise VerificationError('failed to fetch the verification code')
    browser.fill('authcode', content)
    time.sleep(3)
    browser.find_by_id("loginsubmit").click()
def find_mars_hemisphere_images():
    """Returns image urls of Mars Hemispheres"""
    # !which chromedriver
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    description_class = soup.find_all('div', class_='description')
    hemisphere_names = []
    for hemispheres in description_class:
        hemisphere_names.append(hemispheres.find('h3').text)
    start_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    hemisphere_images_windows = []
    # iterate over the scraped hemisphere names (the original referenced an
    # undefined `hemisphere_images` list here)
    for hemispheres_image in hemisphere_names:
        browser.click_link_by_partial_text(hemispheres_image)
        hemispheres_url = browser.url
        new_page = soup.body.find_all('div', class_='container')
        for sample in new_page:
            browser.click_link_by_text('Sample')
            hemisphere_images_windows.append(browser.windows)
        browser.visit(start_url)
    full_hemisphere_images = []
    for full_images in hemisphere_images_windows[3]:
        full_hemisphere_images.append(full_images.url)
    full_hemisphere_image_urls = [
        {"title": hemisphere_names[3], "img_url": full_hemisphere_images[1]},
        {"title": hemisphere_names[2], "img_url": full_hemisphere_images[2]},
        {"title": hemisphere_names[1], "img_url": full_hemisphere_images[3]},
        {"title": hemisphere_names[0], "img_url": full_hemisphere_images[4]},
    ]
    return full_hemisphere_image_urls
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser. """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit('http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml')
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')  # "Search"
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in range(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ',
                                                  time.gmtime()))
                    patent_list.append(patent)
                    print(patent.id, 'new')  # @todo logs
                except Exception:
                    print('error patent')
        if b.is_text_present(u'下一页'):  # "Next page"
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except Exception:
        print('err adding patent')
    finally:
        b.quit()
def traverse(url):
    browser = Browser()
    browser.visit(url)
    with open("student_list.txt") as f:
        enrollment_no = f.read().split()
    for line in enrollment_no:
        time.sleep(1)
        browser.fill("eno", line)
        browser.find_by_value("Submit").click()
        time.sleep(3)
        capture()
        browser.click_link_by_text("Back ")
def download_art(title):
    browser = Browser()
    # Visit URL
    url = "http://gen.lib.rus.ec/scimag/index.php"
    browser.visit(url)
    article_title = browser.find_by_name('s')
    article_title.fill(title)
    button = browser.find_by_value('Search!')
    # Interact with elements
    button.click()
    # sleep is used at each step to pace the program against network speed
    time.sleep(10)
    browser.click_link_by_text('Libgen')
    time.sleep(15)
    browser.click_link_by_partial_href('http://gen.lib.rus.ec/scimag/get.php')
    time.sleep(5)
    browser.quit()
class AutoSearch(object):
    def __init__(self):
        self.browser = None
        self.base_url = 'https://www.baidu.com/'
        self.keywords = ['高压线下也敢飞', '珍惜现在的好天气吧']
        self.keywords_pngs = []

    def send_png(self):
        '''
        Send the email; implement this function yourself.
        :return:
        '''
        sendHtmlMail(self.keywords_pngs)

    def search(self, time_freq="一天内"):  # "within one day" filter label
        self.browser = Browser(driver_name='chrome',
                               executable_path='chromedriver.exe')
        self.browser.visit(self.base_url)
        for word in self.keywords:
            kw = f'+"{word}" site:bbs.wjdaily.com'
            if self.browser.is_element_present_by_id("kw"):
                self.browser.find_by_id("kw").fill(kw)
                time.sleep(1)
                self.browser.find_by_id("su").click()
                # self.browser.find_by_xpath('//*[@id="container"]/div[2]/div/div[1]/span[2]').first.click()
                time.sleep(1)
                if self.browser.is_element_present_by_css(".search_tool_tf",
                                                          wait_time=10):
                    self.browser.find_by_css(".search_tool_tf").first.click()
                    if self.browser.is_element_present_by_text(time_freq):
                        self.browser.click_link_by_text(time_freq)
                        time.sleep(3)
                soup = BeautifulSoup(self.browser.html, "html.parser")
                no_result = soup.find('div', {'class': 'nors'})
                if no_result is None:
                    print("results found; taking a screenshot")
                    screenshot_path = self.browser.screenshot(
                        rf"E:\GitHub\somenzz\bbsMonitor\{word}", suffix=".png")
                    self.keywords_pngs.append(screenshot_path)
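# A minimal driver for the AutoSearch class above, assuming chromedriver.exe
# is resolvable from the working directory as the class expects:
if __name__ == '__main__':
    searcher = AutoSearch()
    searcher.search(time_freq="一天内")  # Baidu's "within one day" time filter
    searcher.send_png()                  # mail the collected screenshots
    searcher.browser.quit()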
def scrape():
    # Dependencies
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import pymongo
    import time
    import ctypes  # An included library with Python install.

    def Mbox(title, text, style):
        return ctypes.windll.user32.MessageBoxW(0, text, title, style)

    mars_data_dict = {}

    ## (1) NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest News Title and
    # Paragraph Text. Assign the text to variables that you can reference later.
    # URL of page to be scraped
    url_nz = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response_nz = requests.get(url_nz)
    # Create BeautifulSoup object; parse with 'lxml'
    soup_nz = BeautifulSoup(response_nz.text, 'lxml')
    # Examine the results, then determine element that contains sought info
    # print(soup_nz.prettify())
    # Find the latest News Title
    news_title = soup_nz.find("div", class_="content_title").a.text[1:-1]
    # Find the latest News Paragraph Text
    news_p = soup_nz.find("div", class_="image_and_description_container").a.text[3:-7]
    mars_data_dict["news_title"] = news_title
    mars_data_dict["news_p"] = news_p

    ## (2) JPL Mars Space Images - Featured Image
    # Use splinter to navigate the site and find the image url for the current
    # Featured Mars Image and assign the url string to a variable called
    # featured_image_url. Make sure to find the image url to the full size
    # .jpg image and to save a complete url string for this image.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # URL of page to be scraped
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/details.")
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/images/largesize")
    time.sleep(2)
    featured_image_url = browser.url
    mars_data_dict["feat_img"] = featured_image_url
    browser.quit()

    ## (3) Mars Weather
    # Visit the Mars Weather twitter account and scrape the latest Mars
    # weather tweet from the page. Save the tweet text for the weather report
    # as a variable called mars_weather.
    # URL of page to be scraped
    url_tweet = 'https://twitter.com/marswxreport?lang=en'
    # Retrieve page with the requests module
    response_tweet = requests.get(url_tweet)
    # Create BeautifulSoup object; parse with 'lxml'
    soup_tweet = BeautifulSoup(response_tweet.text, 'lxml')
    # scrape the latest Mars weather tweet from the page
    tweets = soup_tweet.find_all(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    for tweet in tweets:
        find_text = tweet.text.find("InSight sol")
        if find_text == 0:
            mars_weather = tweet.text
            break
    mars_data_dict["weather"] = mars_weather

    ## (4) Mars Facts
    # URL of page to be scraped
    url_mfacts = 'https://space-facts.com/mars/'
    # Retrieve page with the requests module
    response_mfacts = requests.get(url_mfacts)
    # Create BeautifulSoup object; parse with 'lxml'
    soup_mfacts = BeautifulSoup(response_mfacts.text, 'lxml')
    tables = pd.read_html(url_mfacts)[1]
    mars_data_dict["mfacts"] = tables
    tables.to_html("../html/mars_facts.html")

    ## (5) Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high resolution images for
    # each of Mars's hemispheres. Click each of the hemisphere links to find
    # the image url to the full resolution image. Save both the image url
    # string and the hemisphere title in a dictionary using the keys img_url
    # and title, and append one dictionary per hemisphere to a list.
    # This list will contain one dictionary for each hemisphere
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    # URL of page to be scraped
    url_mhemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_mhemi)
    time.sleep(2)

    # Image 1
    browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced")
    time.sleep(2)
    title1 = browser.title.split("|")[0]
    browser.click_link_by_text("Sample")
    time.sleep(2)
    img1_url = browser.windows[1].url
    time.sleep(2)
    browser.windows[1].close()
    browser.back()
    hemi1_dict = {"title": title1, "img_url": img1_url}

    # Image 2
    browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced")
    time.sleep(2)
    title2 = browser.title.split("|")[0]
    browser.click_link_by_text("Sample")
    time.sleep(2)
    img2_url = browser.windows[1].url
    time.sleep(2)
    browser.windows[1].close()
    browser.back()
    hemi2_dict = {"title": title2, "img_url": img2_url}

    # Image 3
    browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced")
    time.sleep(2)
    title3 = browser.title.split("|")[0]
    browser.click_link_by_text("Sample")
    time.sleep(2)
    img3_url = browser.windows[1].url
    time.sleep(2)
    browser.windows[1].close()
    browser.back()
    hemi3_dict = {"title": title3, "img_url": img3_url}

    # Image 4
    browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced")
    time.sleep(2)
    title4 = browser.title.split("|")[0]
    browser.click_link_by_text("Sample")
    time.sleep(2)
    img4_url = browser.windows[1].url
    time.sleep(2)
    browser.windows[1].close()
    browser.back()
    hemi4_dict = {"title": title4, "img_url": img4_url}

    hemisphere_image_urls = [hemi1_dict, hemi2_dict, hemi3_dict, hemi4_dict]
    mars_data_dict["hemi_img"] = hemisphere_image_urls
    browser.quit()
    Mbox("Mission to Mars Completed", "Congratulations!!! You've mined Mars!", 1)
class UploadTestCase(unittest.TestCase):
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()
        self.testbed.init_memcache_stub()
        self.browser = Browser('chrome')

    def tearDown(self):
        self.testbed.deactivate()

    def test_when_create_task_upload_file(self):
        # login
        self.browser.visit("http://127.0.0.1:8080/")
        self.assertEqual(self.browser.find_by_tag("h3").first.text,
                         "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(
            self.browser.find_link_by_text("Insurance").first.text,
            "Insurance")
        self.browser.visit("http://127.0.0.1:8080/tasks")
        self.browser.click_link_by_text('Create new task')
        self.browser.fill('title', 'title')
        self.browser.fill('text', 'text')
        self.browser.is_element_present_by_name('files[]', wait_time=10)
        self.browser.attach_file(
            'files[]', os.path.join(os.path.dirname(__file__), '1.png'))
        # self.browser.attach_file('files[]', 'test/1.png')
        self.browser.find_by_css('.btn.btn-primary.start').first.click()
        self.assertEqual(
            1, len(self.browser.find_by_css('.template-download.fade.in')))
        self.assertEqual(
            4, len(self.browser.find_by_css('.template-download.fade.in td')))

    def test_when_create_task_upload_many_files(self):
        # login
        self.browser.visit("http://127.0.0.1:8080/")
        self.assertEqual(self.browser.find_by_tag("h3").first.text,
                         "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(
            self.browser.find_link_by_text("Insurance").first.text,
            "Insurance")
        self.browser.visit("http://127.0.0.1:8080/tasks")
        self.browser.click_link_by_text('Create new task')
        self.browser.fill('title', 'title')
        self.browser.fill('text', 'text')
        self.browser.is_element_present_by_name('files[]')
        self.browser.attach_file(
            'files[]', os.path.join(os.path.dirname(__file__), '1.png'))
        self.browser.attach_file(
            'files[]', os.path.join(os.path.dirname(__file__), '1.png'))
        self.browser.attach_file(
            'files[]', os.path.join(os.path.dirname(__file__), '1.png'))
        # self.browser.attach_file('files[]', 'test/1.png')
        self.browser.find_by_css('.btn.btn-primary.start').first.click()
        sleep(3)
        self.assertEqual(
            3, len(self.browser.find_by_css('.files tr.template-download')))
def Scraper():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    print(soup.prettify())
    news_title = soup.find('div', class_='content_title').get_text()
    news_paragraph = soup.find('div', class_='article_teaser_body').get_text()

    # **JPL FEATURED IMAGE**
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    images = soup.footer.find('a', class_='button fancybox')['data-fancybox-href']
    url2 = 'https://www.jpl.nasa.gov'
    actual_url = url2 + images

    # **MARS WEATHER TWEETS**
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    relevant_tweets = soup.find_all(
        'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
    weather_tweet = relevant_tweets[7].get_text()

    # **MARS FACTS**
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df = df.rename(columns={0: 'Characteristic', 1: 'Value'})
    df.set_index('Characteristic')  # note: result is not assigned back
    final_fact_table = df.to_html(classes='Striped-table')

    # **MARS HEMISPHERES**
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    titles = []
    hemi_titles = soup.find_all('h3')
    for i in hemi_titles:
        x = i.get_text()
        titles.append(x)
    links = []
    for j in titles:
        browser.visit(url)
        browser.click_link_by_partial_text(j)
        browser.click_link_by_text('Sample')
        html = browser.html
        soup = bs(html, 'html.parser')
        link = soup.find('div', class_='downloads').find('a')['href']
        links.append(link)

    # mars_hemis = {}
    # for m, k in zip(titles, links):
    #     mars_hemis[m] = k
    mars_hemis = []
    for m, k in zip(titles, links):
        mars_hemis.append({"title": m, "link": k})

    data = {"news_title": news_title,
            "news_paragraph": news_paragraph,
            "actual_url": actual_url,
            "weather_tweet": weather_tweet,
            "final_fact_table": final_fact_table,
            "mars_hemis": mars_hemis}
    # data = {"Latest Mars News Headline": news_title,
    #         "Latest Mars News": news_paragraph,
    #         "Featured Image": image_url,
    #         "Latest Mars Weather Update": weather_tweet,
    #         "Mars Fun Facts": final_fact_table,
    #         "Mars Hemispheres": mars_hemis}
    return data
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_add_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_view_single_entry(self):
        # Login to blog
        self.test_login_correct()
        # Click on top entry title
        self.browser.visit("http://127.0.0.1:8080/entry/1/")
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/1/")

    def test_edit_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Click edit link on top entry
        self.browser.click_link_by_partial_href('edit')
        # Enter new title and contents
        self.browser.fill("title", "edited test post")
        self.browser.fill("content", "edited acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_delete_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Delete entry
        self.browser.click_link_by_partial_href('delete')
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        # Make sure browser puts you back on home
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_logout(self):
        # Login to blog
        self.test_login_correct()
        # Click on 'Logout' link
        self.browser.click_link_by_text('Logout')
        # Check to see if 'Logout' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Logout'),
                         False)
        # Check to see if 'Login' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Login'),
                         True)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
# In[26]:
# finding mars images of hemispheres

# In[27]:
# high res image of cerberus hem
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
browser.click_link_by_text('Sample')
img1 = browser.windows.current = browser.windows[1]
img1

# In[28]:
# high res image of Schiaparelli hem
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
browser.click_link_by_text('Sample')
img2 = browser.windows.current = browser.windows[1]
img2
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def testLoginCorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")

    def testLoginIncorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/login")

    def testAddEditPost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Edit Post')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/edit")
        self.browser.fill("title", "Edited First Post")
        self.browser.fill("content", "Hello Universe!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(self.browser.find_by_tag('h1').first.value,
                         "Edited First Post")
        # divs = self.browser.find_by_tag("div")
        # myList = []
        # if "Hello Universe!" in divs:
        #     myList.append("Hello Universe!")
        # self.assertEqual(myList[0], "Hello Universe!")

    def testAddDeletePost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Delete Post')
        self.assertEqual(self.browser.url,
                         "http://127.0.0.1:5000/post/1/delete")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(len(self.browser.find_by_tag('h1')), 0)
        # divs = self.browser.find_by_tag("div")
        # myList = []
        # if "Hello Universe!" in divs:
        #     myList.append("Hello Universe!")
        # self.assertEqual(myList[0], "Hello Universe!")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
class Compass:
    def __init__(self, username='', password='', outdir=''):
        self._username = username
        self._password = password
        self._outdir = outdir
        self._browser = None
        self._record = None

    def quit(self):
        if self._browser:
            self._browser.quit()
            self._browser = None

    def loggin(self):
        prefs = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": self._outdir,
            "browser.helperApps.neverAsk.saveToDisk":
                "application/octet-stream,application/msexcel,application/csv"}
        self._browser = Browser('firefox', profile_preferences=prefs)
        self._browser.visit('https://compass.scouts.org.uk/login/User/Login')
        self._browser.fill('EM', self._username)
        self._browser.fill('PW', self._password)
        time.sleep(2)
        self._browser.find_by_value('Submit').first.click()
        # Look for the Role selection menu and select my Group Admin role.
        self._browser.is_element_present_by_name(
            'ctl00$UserTitleMenu$cboUCRoles', wait_time=30)
        self._browser.select('ctl00$UserTitleMenu$cboUCRoles', '1253644')

    def export(self, section):
        # Select the My Scouting link.
        self._browser.is_text_present('My Scouting', wait_time=30)
        self._browser.click_link_by_text('My Scouting')

        def wait_then_click_xpath(xpath, wait_time=30):
            self._browser.is_element_present_by_xpath(xpath,
                                                      wait_time=wait_time)
            self._browser.find_by_xpath(xpath).click()

        # Click the "Group Sections" hotspot.
        wait_then_click_xpath('//*[@id="TR_HIER7"]/h2')
        # Click the link that shows the number of members in the section.
        # This is the one bit that is section specific. We might be able to
        # match on the Section name in the list, which would make it more
        # robust, but at present we just hard-code the location in the list.
        section_map = {
            'garrick': 2,
            'paget': 3,
            'swinfen': 4,
            'brown': 4,
            'maclean': 5,
            'rowallan': 6,
            'somers': 7,
            'boswell': 8,
            'erasmus': 9,
            'johnson': 10
        }
        wait_then_click_xpath(
            '//*[@id="TR_HIER7_TBL"]/tbody/tr[{}]/td[4]/a'.format(
                section_map[section.lower()]))
        # Click on the Export button.
        wait_then_click_xpath('//*[@id="bnExport"]')
        # Click to say that we want a CSV output.
        wait_then_click_xpath(
            '//*[@id="tbl_hdv"]/div/table/tbody/tr[2]/td[2]/input')
        time.sleep(2)
        # Click to say that we want all fields.
        wait_then_click_xpath('//*[@id="bnOK"]')
        download_path = os.path.join(self._outdir, 'CompassExport.csv')
        if os.path.exists(download_path):
            log.warn("Removing stale download file.")
            os.remove(download_path)
        # Click the warning.
        wait_then_click_xpath('//*[@id="bnAlertOK"]')
        # Browser will now download the csv file into outdir. It will be
        # called CompassExport.
        # Wait for file.
        timeout = 30
        while not os.path.exists(download_path):
            time.sleep(1)
            timeout -= 1
            if timeout <= 0:
                log.warn("Timeout waiting for {} export to download.".format(
                    section))
                break
        # Rename download file.
        os.rename(download_path,
                  os.path.join(self._outdir, '{}.csv'.format(section)))
        log.info("Completed download for {}.".format(section))
        # Draw breath.
        time.sleep(1)

    def load_from_dir(self):
        # Load the records from the set of files in self._outdir.
        log.debug('Loading from {}'.format(self._outdir))

        def get_section(path, section):
            df = pd.read_csv(path, dtype=object, sep=',')
            df['section'] = section
            df['forenames_l'] = [_.lower().strip() for _ in df['forenames']]
            df['surname_l'] = [_.lower().strip() for _ in df['surname']]
            return df

        self._records = pd.DataFrame().append(
            [get_section(os.path.join(self._outdir, section),
                         os.path.splitext(section)[0])
             for section in os.listdir(self._outdir)],
            ignore_index=True)

    def find_by_name(self, firstname, lastname, section_wanted=None,
                     ignore_second_name=True):
        """Return list of matching records."""
        recs = self._records
        if ignore_second_name:
            df = recs[
                (recs.forenames_l.str.lower().str.match(
                    '^{}.*$'.format(
                        firstname.strip(' ')[0].lower().strip()))) &
                (recs.surname_l == lastname.lower().strip())]
        else:
            df = recs[(recs.forenames_l == firstname.lower().strip()) &
                      (recs.surname_l == lastname.lower().strip())]
        if section_wanted is not None:
            df = df[(df['section'] == section_wanted)]
        return df

    def sections(self):
        "Return a list of the sections for which we have data."
        return self._records['section'].unique()

    def all_yp_members_dict(self):
        return {s: members for s, members in self._records.groupby('section')}

    def section_all_members(self, section):
        return [m for i, m in self._records[
            self._records['section'] == section].iterrows()]

    def section_yp_members_without_leaders(self, section):
        return [m for i, m in self._records[
            (self._records['section'] == section) &
            (self._records['role'].isin(
                ['Beaver Scout', 'Cub Scout', 'Scout']))].iterrows()]

    def members_with_multiple_membership_numbers(self):
        return [member for s, member in self._records.groupby(
            ['forenames', 'surname']).filter(
                lambda x: len(x['membership_number'].unique()) > 1).groupby(
                    ['forenames', 'surname', 'membership_number'])]
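# A minimal usage sketch for the Compass class above; the credentials and
# output directory are placeholders, and 'garrick' is one of the keys in
# section_map:
compass = Compass(username='me@example.com',  # placeholder credentials
                  password='secret',
                  outdir='/tmp/compass')
try:
    compass.loggin()
    compass.export('garrick')  # downloads and renames /tmp/compass/garrick.csv
finally:
    compass.quit()

compass.load_from_dir()
print(compass.sections())
print(compass.find_by_name('Alice', 'Smith', section_wanted='garrick'))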
def scrape():
    # raw string needed: "\a" in the original path was an escape character
    executable_path = {"executable_path": r"users\spitc\anaconda3\lib\site"}
    browser = Browser("chrome", **executable_path, headless=False)

    # title
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")
    title = soup.find("div", class_="content_title").text

    # paragraph
    browser.click_link_by_text(title)
    html = browser.html
    soup = bs(html, "html.parser")
    paragraph = soup.find("div", class_="wysiwyg_content")
    para = paragraph.find('p').text

    # picture
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    home_url = 'https://www.jpl.nasa.gov'
    browser.visit(url_image)
    html = browser.html
    soup = bs(html, "html.parser")
    mars_img = soup.find("li", class_="slide")
    mars_src = mars_img.find("a")
    featured_image_url = home_url + mars_src["data-fancybox-href"]

    # weather
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html = browser.html
    soup = bs(html, "html.parser")
    mars_w = soup.find("div", class_="js-tweet-text-container").text.rstrip()

    # facts
    facts_url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(facts_url)
    df = mars_table[0]
    df.columns = ["Measurements", "Values"]
    df_facts = df.set_index('Measurements')
    mars_facts = df_facts.to_html()
    mars_facts_final = mars_facts.replace('\n', '')

    # hemispheres (mars_hemi is a helper defined elsewhere in this project)
    cerberus_link = mars_hemi("Cerberus", browser)
    schiaparelli_link = mars_hemi("Schiaparelli", browser)
    syrtis_major_link = mars_hemi("Syrtis Major", browser)
    valles_marineris_link = mars_hemi("Valles Marineris", browser)
    hemisphere_image_urls = [{
        "title": "Valles_Marineris_Hemisphere",
        "img_url": valles_marineris_link
    }, {
        "title": "Cerberus_Hemisphere",
        "img_url": cerberus_link
    }, {
        "title": "Schiaparelli_Hemisphere",
        "img_url": schiaparelli_link
    }, {
        "title": "Syrtis_Major_Hemisphere",
        "img_url": syrtis_major_link
    }]

    mars_data = {
        "title": title,
        "paragraph": para,
        "mars_pic": featured_image_url,
        "mars_weather": mars_w,
        "mars_facts": mars_facts_final,
        hemisphere_image_urls[0]["title"]: hemisphere_image_urls[0]["img_url"],
        hemisphere_image_urls[1]["title"]: hemisphere_image_urls[1]["img_url"],
        hemisphere_image_urls[2]["title"]: hemisphere_image_urls[2]["img_url"],
        hemisphere_image_urls[3]["title"]: hemisphere_image_urls[3]["img_url"]
    }
    return mars_data
#!/usr/bin/python
from splinter import Browser

b = Browser()
url = 'http://google.com'
b.visit(url)
b.click_link_by_text('Sign up')
b.select("rateplanid", "spn")
b.fill('spn_postal', '11223')
b.fill('spn_email', '*****@*****.**')
b.check('spn_terms')
b.find_by_value('submit').first.click()
b.find_by_value('submit').first.click()
from splinter import Browser

# Your ID Password
user_email = "nishnik"
user_pass = "******"

browser = Browser('firefox')
browser.visit('http://www.facebook.com')
browser.fill('email', user_email)
browser.fill('pass', user_pass)
button = browser.find_by_id('loginbutton')
button.click()

# Paste the url you need to download from. Note: It must be from mobile site
browser.visit('https://m.facebook.com/photo.php?fbid=780845462017409&id=100002758879147&set=oa.876940942416747&relevant_count=1&source=48&refid=18&_ft_=qid.6274517251577062760%3Amf_story_key.876940939083414%3Atl_objid.876940939083414')

# The number of consecutive pics you have to download
NUM_PICS = 56
i = 0
while i < NUM_PICS:
    i = i + 1
    browser.click_link_by_text('View full size')
    browser.screenshot("screenshot" + str(i) + ".png")
    browser.back()
    browser.click_link_by_text('Next')
browser.quit()
def scrape():
    print('INITIALIZING DATA SCRAPE FOR YOSEMITE NATIONAL PARK')
    print('-------------------------------------------------------')
    # initialize browser
    # executable_path = {'executable_path': 'chromedriver.exe'}
    # use executable path below for mac
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # dictionary to hold final scraped data
    yosemite_data = {}

    print('COMMENCING DATA SCRAPE FOR ECONOMIC BENEFITS INFO')
    # URL of yosemite articles page to be scraped
    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    time.sleep(2)

    # empty lists to hold raw scraped data
    article_links = []
    headlines = []
    article_contents = []
    # empty lists that will hold cleaned scraped data
    years = []
    amounts = []
    job_counts = []
    visitor_counts = []
    # empty list to hold final scraped data
    economic_benefits = []

    # go through pages 1-33 and find links of targeted articles
    for x in range(1, 34):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        article_snippets = soup.find_all('li', class_='ListingList-item')
        substring = 'Economic Benefit'
        for article_snippet in article_snippets:
            snippet_headline = article_snippet.find(
                'h3', class_='ListingResults-title').text
            if substring in snippet_headline:
                end_link = article_snippet.find('a')['href']
                article_link = 'https://www.nps.gov' + end_link
                article_links.append(article_link)
        browser.click_link_by_text('Next ')
        time.sleep(1)

    # visit each article link and extract content
    for article_link in article_links:
        browser.visit(article_link)
        article_html = browser.html
        article_soup = BeautifulSoup(article_html, 'html.parser')
        headline = article_soup.find('div', class_='ContentHeader').text
        headline = headline.replace('\n', '')
        headlines.append(headline)
        article_content = article_soup.find('div',
                                            class_='ArticleTextGroup').text
        article_contents.append(article_content)

    # loop through headlines and extract economic benefit $ amount (in millions)
    for headline in headlines:
        headline_split = headline.split('$')[1]
        amount = headline_split[:3]
        amounts.append(amount)

    # loop through article contents and extract year, job count, and visitor count
    for article_content in article_contents:
        year_split = article_content.split('Park in ')[1]
        year = year_split[:4]
        years.append(year)
        job_split = article_content.split('supported ')[1]
        job_count = job_split[:5]
        if ',' in job_count:
            job_count = job_count.replace(',', '')
            job_counts.append(job_count)
        elif ' ' in job_count:
            job_count = job_count.replace(' ', '')
            job_counts.append(job_count)
        else:
            job_counts.append(job_count)
        visitor_split = article_content.split('shows that')[1]
        visitor_count = visitor_split[:10]
        visitor_count = visitor_count.replace(',', '').replace(
            '\xa0', '').replace(' ', '')
        visitor_counts.append(visitor_count)

    # append extracted information into economic_benefits dictionary
    economic_benefits.append({
        'years': years,
        'amounts': amounts,
        'job_counts': job_counts,
        'visitor_counts': visitor_counts
    })

    # append missing 2015 data
    economic_benefits[0]['years'].insert(2, '2015')
    economic_benefits[0]['amounts'].insert(2, '594')
    economic_benefits[0]['job_counts'].insert(2, '6890')
    economic_benefits[0]['visitor_counts'].insert(2, '4150217')

    # append to yosemite_data dictionary
    yosemite_data['economic_benefits'] = economic_benefits
    print('OBTAINED ECONOMIC BENEFITS')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TRAIL HEAD POSTS')
    # URL of page to be scraped
    url = 'https://www.hikespeak.com/sierras/yosemite/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Examine the results, then determine element that contains sought info
    # results are returned as an iterable list
    results = soup.find_all("tr")
    post = []
    ## Probably need a loop here for all 20 rows
    # Loop through returned results
    for result in results:
        # Error handling
        try:
            # Identify and return title of listing
            trail = result.find("td", class_="column-2").text
            distance = result.find("td", class_="column-3").text
            coordinates = result.find("td", class_="column-4").text
            # Run only if trail, distance, and coordinates are available
            if (trail and distance and coordinates):
                # Print results
                print('-------------')
                print(trail)
                print(distance)
                print(coordinates)
                post.append({
                    'trail': trail,
                    'distance': distance,
                    'coordinates': coordinates
                })
        except Exception as e:
            print(e)
    yosemite_data['post'] = post
    print('OBTAINED TRAIL HEAD POSTS')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TRAIL TABLE')
    # URL of page to be scraped
    trail_table_url = 'https://www.yosemitehikes.com/hikes.htm'
    # Retrieve page with the requests module
    response = requests.get(trail_table_url)
    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')
    # Examine the results, then determine element that contains sought info
    # results are returned as an iterable list
    trail_table_results = soup.find_all('tr')
    trail_table_post = []
    # Loop through returned results (the original referenced the stale
    # `result` variable from the previous loop in several places below;
    # it should be `trail_table_result`)
    for trail_table_result in trail_table_results:
        # Error handling
        try:
            # Identify and return trail name
            trail_name = trail_table_result.find('td', column='Trail').text
            # Identify and return trail's distance
            raw_distance = trail_table_result.find(
                'td', column="Distance (miles/km)").text
            if ' (' in raw_distance:
                distance = str(raw_distance[:raw_distance.find(" (")])
            else:
                distance = trail_table_result.find(
                    'td', column="Distance (miles/km)").text
            # Identify and return trail's elevation
            try:
                raw_elevation = trail_table_result.find(
                    'td', column="Elevation Gain (feet/meters)").text
                elevation = str(raw_elevation[:raw_elevation.find(" (")])
                if ',' in elevation:
                    elevation = elevation.replace(',', '')
                else:
                    elevation = raw_elevation[:raw_elevation.find(" (")]
            except Exception as elevation:
                elevation = trail_table_result.find(
                    'td', column="Elevation Gain (feet/meters)").text
            # Identify and return trail's crowd rating
            crowd = str(trail_table_result.find('td',
                                                column="Crowd Factor"))[44]
            # Identify and return trail's scenery rating
            scenery = str(trail_table_result.find(
                'td', column="Scenery Factor"))[-14]
            # Identify and return trail's difficulty rating
            difficulty = str(trail_table_result.find(
                'td', column="Difficulty"))[-14]
            # Dictionary to be inserted as a MongoDB document
            trail_table_post.append({
                'trail_name': trail_name,
                'distance': distance,
                'elevation': elevation,
                'crowd': crowd,
                'scenery': scenery,
                'difficulty': difficulty
            })
        except Exception as e:
            print(e)
    yosemite_data['trail_table_post'] = trail_table_post
    print('OBTAINED TRAIL TABLE')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR WEATHER')
    current_weather = []
    apikey = api_key.api_key
    location = "Yosemite Valley"
    url = ("http://api.openweathermap.org/data/2.5/weather?units=Imperial&appid="
           + apikey + "&q=" + location)
    weather = requests.get(url).json()
    todays_temp = weather["main"]["temp"]
    todays_humid = weather["main"]["humidity"]
    todays_cloud = weather["clouds"]["all"]
    todays_wind = weather["wind"]["speed"]
    converted = datetime.utcfromtimestamp(weather["dt"])
    local_time = converted - timedelta(hours=7, minutes=0)
    weather_date = local_time.strftime("%B %d, %Y")
    current_weather.append({
        'todays_temp': todays_temp,
        'todays_humid': todays_humid,
        'todays_cloud': todays_cloud,
        'todays_wind': todays_wind,
        'weather_date': weather_date
    })
    yosemite_data['weather'] = current_weather
    print('OBTAINED WEATHER')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TWITTER')
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    target_user = "******"
    user_tweets_only = api.user_timeline(target_user, count=1,
                                         result_type="recent")
    user_tweets = user_tweets_only[0]["text"]
    yosemite_data['tweet'] = user_tweets
    print('OBTAINED TWEET')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR MOST RECENT NEWS')
    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    recent_news = []
    # Article Title
    news_title = soup.find("h3", class_="ListingResults-title").text
    # Article Date
    article_date = soup.find("div", class_="ListingMeta").text
    # Link to full article
    results = soup.find("li", class_="ListingList-item ListingResults-item")
    news_link = results.find("a")["href"]
    split_url = urlsplit(url)
    full_news_link = split_url.scheme + "://" + split_url.netloc + news_link
    # Article summary
    article_text = soup.find("p", class_="ListingResults-description").text
    recent_news.append({
        'news_title': news_title,
        'article_date': article_date,
        'full_news_link': full_news_link,
        'article_text': article_text
    })
    yosemite_data['recent_news'] = recent_news
    print('OBTAINED MOST RECENT NEWS')
    print('SCRAPING COMPLETED')
    print('-------------------------------------------------------')
    print(yosemite_data)
    return yosemite_data
hemisphere2 = soup.find_all('a', class_="itemLink product-item")[3]['href']
hemisphere3 = soup.find_all('a', class_="itemLink product-item")[5]['href']
hemisphere4 = soup.find_all('a', class_="itemLink product-item")[7]['href']

# Create a single list called 'mars4hemis'
mars4hemis = [hemisphere1, hemisphere2, hemisphere3, hemisphere4]
mars4hemis

# Go to each of the 4 hemisphere websites and scrape the link for the Sample Image
usgs_url = "https://astrogeology.usgs.gov"
image_url = []
titles = []
for x in range(len(mars4hemis)):
    # Go to the hemisphere website
    browser.visit(usgs_url + mars4hemis[x])
    browser.click_link_by_text("Open")
    time.sleep(2)
    # Click Sample link to get the image
    sample = browser.find_by_text('Sample')
    image = sample['href']
    image_url.append(image)
    # Search the h2 tags to get the title
    headers = browser.find_by_tag('h2')
    full_title = headers.text
    # note: the original used full_title.strip('Enhanced'), which strips those
    # *characters* from both ends rather than the word; replace is the intent
    title = full_title.replace(' Enhanced', '')
    titles.append(title)
    # print(browser.url)
    print(title, image)

# Show the two newly created lists: titles and image_url
print(titles)
class InstaLiker():
    # constructor
    def __init__(self):
        self.mUrl = "https://www.instagram.com/"
        self.cycles = 4
        self.browser = Browser()
        self.username = "******"
        self.pw = 'xxxxxxxxxxxxxxxx\r'
        self.totalLikes = 0
        self.blackList = ["make a list of users to exclude",
                          "including your own username"]

    # scroll the page and do the liking
    def launchPage(self):
        self.browser.visit(self.mUrl)
        self.login()
        self.scrollBy()
        for i in range(0, self.cycles):
            self.likePosts()
        print("just liked " + str(self.totalLikes) + " pix...Yay!")

    def login(self):
        print("login")
        print("logging in as " + self.username)
        self.browser.click_link_by_text('Log in')
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.pw)
        form = self.browser.find_by_tag('form')
        inputs = form.find_by_tag('button')
        inputs[0].click()
        # need to sleep a few seconds here
        time.sleep(5)

    def likePosts(self):
        print("liking posts")
        likeList = self.browser.find_by_text("Like")
        if len(likeList) == 0:
            print("nothing left to like. attempt to scroll farther to load more posts.")
            self.scrollBy()
            time.sleep(3)
            likeList = self.browser.find_by_text("Like")
            print("likeList is now: " + str(len(likeList)))
        if (len(likeList) > 0):
            print("found " + str(len(likeList)) + " posts to like")
            for foo in likeList:
                tmpParentNode = foo.find_by_xpath("ancestor::article/header")
                print(tmpParentNode["innerText"])
                if self.checkBlackList(tmpParentNode["innerText"]) == 0:
                    foo.click()
                    self.totalLikes += 1
                    time.sleep(1)

    def checkBlackList(self, pString):
        for foo in self.blackList:
            if foo in pString:
                print("found blacklisted item '" + foo + "'")
                return 1
        return 0

    def scrollBy(self):
        print("scrolling down.")
        self.browser.execute_script("window.scrollBy(0,30000);")
        time.sleep(1)

    def boneyard(self):
        print('boneyard')
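# A minimal driver for InstaLiker, assuming real credentials have been filled
# into the constructor first:
liker = InstaLiker()
liker.launchPage()    # logs in, scrolls, and likes posts for self.cycles rounds
liker.browser.quit()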
import time

import pandas as pd
import tweepy
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from splinter import Browser

import apikeys  # local module holding the Twitter credentials


def scrape():
    # NASA Mars news: grab the latest headline and teaser paragraph.
    url = 'https://mars.nasa.gov/news/'
    browser = webdriver.Chrome('chromedriver.exe')
    browser.get(url)
    html = browser.page_source
    soup = bs(html, 'html.parser')
    browser.close()
    soup_li = soup.find_all('li', class_='slide')
    list_of_titles = []
    list_of_paragraphs = []
    for eachslide in soup_li:
        one_title = eachslide.find('div', class_='content_title').text
        one_paragraph = eachslide.find('div', class_='article_teaser_body').text
        list_of_titles.append(one_title)
        list_of_paragraphs.append(one_paragraph)
    ######## NEWS TITLE AND PARAGRAPHS LOCATION #########
    news_title = list_of_titles[0]
    news_p = list_of_paragraphs[0]
    #####################################################

    splint_browser = Browser('chrome', executable_path='chromedriver.exe', headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    splint_browser.visit(url)
    splint_browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    splint_browser.click_link_by_partial_text('more info')
    html = splint_browser.html
    soup = bs(html, 'html.parser')
    splint_browser.quit()
    image_src = soup.find_all('figure', class_='lede')
    for each in image_src:
        ######### FEATURED IMAGE URL #########################
        featured_image_url = 'https://www.jpl.nasa.gov' + each.a['href']
        #######################################################

    # Twitter API Keys
    consumer_key = apikeys.TWITTER_CONSUMER_KEY
    consumer_secret = apikeys.TWITTER_CONSUMER_SECRET
    access_token = apikeys.TWITTER_ACCESS_TOKEN
    access_token_secret = apikeys.TWITTER_ACCESS_TOKEN_SECRET

    # Setup Tweepy API Authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    public_tweets = api.user_timeline('marswxreport', count=5)
    for tweet in public_tweets:
        ########## MARS WEATHER TWEET ##########################
        if ("hPa" in tweet['text']) and ("Sol" in tweet['text']):
            mars_weather = tweet['text']
            break
        #########################################################

    url_tables = 'https://space-facts.com/mars/'
    tables = pd.read_html(url_tables)
    table_df = pd.DataFrame(tables[0])
    table_df = table_df.rename(columns={0: "planet_profile", 1: "mars_data"})
    table_df = table_df.set_index('planet_profile')
    ############### TABLE WITH MARS INFORMATION ###############
    table_html = pd.DataFrame.to_html(table_df)
    ###########################################################

    splint_browser = Browser('chrome', executable_path='chromedriver.exe', headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    splint_browser.visit(url)
    hemisphere_list = []
    html = splint_browser.html
    soup = bs(html, 'html.parser')
    findHemisphere = soup.find_all('div', class_='item')
    for each in findHemisphere:
        hemisphere_list.append(each.h3.text)
    splint_browser.quit()

    hemisphere_image = []
    for eachHemi in hemisphere_list:
        splint_browser = Browser('chrome', executable_path='chromedriver.exe', headless=False)
        splint_browser.visit(url)
        time.sleep(2)
        splint_browser.click_link_by_partial_text(eachHemi)
        time.sleep(2)
        splint_browser.click_link_by_text('Sample')
        splint_browser.windows.current = splint_browser.windows[1]
        html = splint_browser.html
        soup = bs(html, 'html.parser')
        splint_browser.quit()
        hemi_image = soup.body.find('img')['src']
        hemisphere_image.append(hemi_image)

    ################ HEMISPHERES IMAGES - URL ########################
    title_image_url = []
    title_image_tuple = zip(hemisphere_list, hemisphere_image)
    for each in title_image_tuple:
        temp_dict = {}
        temp_dict['title'] = each[0]
        temp_dict['img_url'] = each[1]
        title_image_url.append(temp_dict)
    ###################################################################

    mars_dict = {
        'News_Title': news_title,
        'News_Paragraph': news_p,
        'Featured_Image': featured_image_url,
        'Mars_Weather': mars_weather,
        'Mars_Info': table_html,
        'Hemisphere_Images': title_image_url
    }
    return mars_dict
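# A small usage sketch, assuming the scrape() above is imported from this
# module: run the scrape and inspect what came back.
if __name__ == "__main__":
    result = scrape()
    for key, value in result.items():
        print(key, "->", str(value)[:80])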
#!/usr/bin/python
import random

from splinter import Browser

email = str(random.randrange(10000000, 99999999)) + '@comcast.com'
zip_code = random.randrange(10000, 99999)
url = 'http://captive.apple.com'
browser = Browser('firefox')
browser.visit(url)
browser.click_link_by_text('Sign up')
browser.select("rateplanid", "spn")
browser.fill('spn_postal', zip_code)
browser.fill('spn_email', email)
browser.check('spn_terms')
browser.find_by_value('submit').first.click()
browser.find_by_value('submit').first.click()
browser.quit()
import time

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    # executable_path was undefined in the original snippet; define it the
    # same way the sibling scrapers in this collection do.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser("chrome", **executable_path, headless=False)

    # NASA Mars News
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news = soup.find('ul', class_='item_list').find_all('li', class_='slide')
    for i in news[:1]:
        news_title = i.find('h3').text
        news_p = i.find('div', class_='article_teaser_body').text
        print(news_title, '\n', news_p)

    # JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    photos = soup.find('ul', class_='articles').find_all('li', class_='slide')
    for i in photos[:1]:
        partial_link = i.find('div', class_='img').find('img')['src']
        featured_image_url = 'https://www.jpl.nasa.gov/' + partial_link
        print(featured_image_url)

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    tables = pd.read_html(html)
    table0 = tables[0]
    table0.columns = ['Feature', 'Value']
    table0['Feature'] = table0['Feature'].apply(lambda x: x.replace(":", ''))
    print(table0)
    table0.set_index('Feature', inplace=True)
    table0.to_html('Mars_Facts.html')
    table_data = table0.reset_index().values.tolist()

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find('div', id='product-section').find_all('div', class_='item')
    names_list = []
    img_link_list = []
    for result in results:
        result_link = 'https://astrogeology.usgs.gov/' + result.find('a')['href']
        name = result.find('h3').text
        names_list.append(name)
        browser.visit(result_link)
        time.sleep(1)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        browser.click_link_by_text('Open')
        time.sleep(1)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        img_link = soup.find('div', class_='downloads').find('a')['href']
        img_link_list.append(img_link)
    hemisphere_image_urls = []
    for name, link in zip(names_list, img_link_list):
        hemisphere_image_urls.append({'title': name, 'img_url': link})
    browser.quit()

    mars_dict = {'News_Title': news_title, 'News_Summary': news_p,
                 "Mars_img": featured_image_url, "Mars_Facts": table_data,
                 "Mars_Hemispheres": hemisphere_image_urls}
    return mars_dict
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape():
    mars_all = {}

    # MARS NEWS
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # Prepare an empty list for headlines and paragraphs
    news_info = []
    html = browser.html
    soup = bs(html, 'lxml')
    news_title = soup.find("div", class_='content_title').find('a').text
    news_paragraph = soup.find("div", class_='article_teaser_body').text
    news_info.append({"Headline": news_title, "Paragraph": news_paragraph})
    mars_all['news_title'] = news_title
    mars_all['news_paragraph'] = news_paragraph

    # MARS IMAGE
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    # Define the base URL of the high-res image
    base_imgurl = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/'
    # Locate the image, strip it into components, and keep only the 8-digit image name
    image_name = soup.find('div', class_='img').find('img')['src']
    image_name = image_name.split("/")[-1:][0][0:8]
    # Concatenate the image URL components
    featured_image_url = base_imgurl + image_name + '_hires.jpg'
    mars_all['featured_image_url'] = featured_image_url

    # MARS TWEET
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    mars_weather = soup.find('div', class_='js-tweet-text-container').find('p').text
    mars_all['mars_weather'] = mars_weather

    # MARS FACTS
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    # Specify column titles for the fact table
    mars_facts = tables[0]
    mars_facts.columns = ['Statistic', 'Detail']
    # Convert the DataFrame to HTML
    mars_facts = mars_facts.to_html()
    mars_all['mars_facts'] = mars_facts

    # MARS HEMISPHERES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Prepare an empty list to store dictionaries of image links and titles
    mars_hemispheres = []
    # Loop through the 4 hemispheres
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        # Get the current HTML page structure
        html = browser.html
        soup = bs(html, 'lxml')
        # Identified from the enhanced image, this is the base URL...
        base_url = 'https://astrogeology.usgs.gov'
        # Each hemisphere image location is found here...store it in a variable
        hemisphere_image = soup.find_all('img', class_='wide-image')[0]['src']
        image_link = base_url + hemisphere_image
        # Store the title in a variable...need to remove ' Enhanced'
        hemisphere_title = soup.find('h2', class_='title').text.replace(' Enhanced', '')
        # Append image and title to a dictionary and append to the list
        mars_hemispheres.append({
            'Hemisphere': hemisphere_title,
            'ImageURL': image_link
        })
        # Back to the previous page to loop through the other hemispheres.
        browser.click_link_by_text('Back')
    mars_all['hemispheres'] = mars_hemispheres

    return mars_all
import unittest
from uuid import uuid4

from splinter import Browser

# flapp and UserFactory come from the Flask application under test.


class TestRoutes(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # socketio.run(flapp, port=flapp.config['PORT'])
        pass

    # @init_db
    def setUp(self):
        self.browser = Browser()
        self.username = uuid4().hex
        self.userpassword = uuid4().hex

    def tearDown(self):
        self.browser.quit()

    def go_home(self):
        self.browser.visit('http://localhost:%s' % flapp.config['PORT'])

    def login(self, user):
        self.go_home()
        self.browser.fill_form({'username': self.username,
                                'password': self.userpassword})
        self.browser.find_by_value('Sign in').click()

    def test_login_success_with_confirmed_user(self):
        self.login(UserFactory.seed_confirmed_user(self.username, self.userpassword))
        assert self.browser.is_text_present('Signed in as %s' % self.username)

    def test_login_failure_with_nonconfirmed_user(self):
        user = UserFactory.seed_nonconfirmed_user(self.username, self.userpassword)
        self.login(user)
        assert self.browser.is_text_not_present('Signed in as %s' % self.username)
        assert self.browser.is_text_present('Sign in')

    def test_login_failure_with_nonexisting_user(self):
        self.go_home()
        fake_username = uuid4().hex
        self.browser.fill_form({'username': fake_username, 'password': uuid4().hex})
        self.browser.find_by_value('Sign in').click()
        assert self.browser.is_text_not_present('Signed in as %s' % fake_username)
        assert self.browser.is_text_present('Sign in')

    def test_logout(self):
        self.login(UserFactory.seed_confirmed_user(self.username, self.userpassword))
        self.browser.click_link_by_text('Sign out')
        assert self.browser.is_text_not_present('Signed in as %s' % self.username)
        assert self.browser.is_text_present('Sign in')
import logging
import os
import time
from functools import lru_cache
from io import StringIO

import pandas as pd
from lxml import etree
from splinter import Browser

log = logging.getLogger(__name__)


class Compass:
    def __init__(self, username='', password='', outdir=''):
        self._username = username
        self._password = password
        self._outdir = outdir
        self._browser = None
        self._records = None

    def quit(self):
        if self._browser:
            self._browser.quit()
            self._browser = None

    def loggin(self):
        prefs = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": self._outdir,
            "browser.helperApps.neverAsk.saveToDisk":
                "application/octet-stream,application/msexcel,application/csv"}
        self._browser = Browser('chrome')  # , profile_preferences=prefs)
        self._browser.visit('https://compass.scouts.org.uk/login/User/Login')
        self._browser.fill('EM', self._username)
        self._browser.fill('PW', self._password)
        time.sleep(1)
        self._browser.find_by_text('Log in').first.click()
        # Look for the Role selection menu and select my Group Admin role.
        self._browser.is_element_present_by_name(
            'ctl00$UserTitleMenu$cboUCRoles', wait_time=30)
        self._browser.select('ctl00$UserTitleMenu$cboUCRoles', '1253644')
        time.sleep(1)

    def wait_then_click_xpath(self, xpath, wait_time=30, frame=None):
        frame = self._browser if frame is None else frame
        while True:
            try:
                if frame.is_element_present_by_xpath(xpath, wait_time=wait_time):
                    frame.find_by_xpath(xpath).click()
                    break
                else:
                    log.warning("Timeout expired waiting for {}".format(xpath))
                    time.sleep(1)
            except Exception:
                log.warning("Caught exception: ", exc_info=True)

    def wait_then_click_text(self, text, wait_time=30, frame=None):
        frame = self._browser if frame is None else frame
        while True:
            if frame.is_text_present(text, wait_time=wait_time):
                frame.click_link_by_text(text)
                break
            else:
                log.warning("Timeout expired waiting for {}".format(text))

    def adult_training(self):
        self.home()
        # Navigate to the training page and show all records.
        self.wait_then_click_text('Training')
        time.sleep(1)
        self.wait_then_click_text('Adult Training')
        time.sleep(1)
        self.wait_then_click_xpath('//*[@id="bn_p1_search"]')

    def home(self):
        # Click the logo to take us to the top.
        self.wait_then_click_xpath('//*[@alt="Compass Logo"]')
        time.sleep(1)

    def search(self):
        self.home()
        # Click search button.
        self.wait_then_click_xpath('//*[@id="mn_SB"]')
        time.sleep(1)
        # Click "Find Member(s)".
        self.wait_then_click_xpath('//*[@id="mn_MS"]')
        time.sleep(1)
        # Navigate to the search popup and show all records.
        with self._browser.get_iframe('popup_iframe') as i:
            self.wait_then_click_xpath('//*[@id="LBTN2"]', frame=i)
            time.sleep(1)
            self.wait_then_click_xpath('//*[@class="popup_footer_right_div"]/a', frame=i)
            time.sleep(1)

    def lookup_member(self, member_number):
        self.home()
        # Click search button.
        self.wait_then_click_xpath('//*[@id="mn_SB"]')
        time.sleep(1)
        xpath = '//*[@id="CNLookup2"]'
        while True:
            try:
                if self._browser.is_element_present_by_xpath(xpath, wait_time=30):
                    self._browser.find_by_xpath(xpath).fill(member_number)
                    break
                else:
                    log.warning("Timeout expired waiting for {}".format(xpath))
                    time.sleep(1)
            except Exception:
                log.warning("Caught exception: ", exc_info=True)
        self.wait_then_click_xpath('//*[@id="mn_QS"]')

    def fetch_table(self, table_id):
        parser = etree.HTMLParser()

        def columns(row):
            return ["".join(_.itertext()) for _ in
                    etree.parse(StringIO(row.html), parser).findall('/*/td')]

        def headers(row):
            return ["".join(_.itertext()) for _ in
                    etree.parse(StringIO(row.html), parser).findall('/*/td')]

        headers_xpath = '//*[@id ="{}"]/thead/*'.format(table_id)
        table_xpath = '//*[@id ="{}"]/tbody/tr[not(@style="display: none;")]'.format(table_id)
        if self._browser.is_element_present_by_xpath(table_xpath, wait_time=5):
            headings = [headers(row) for row in
                        self._browser.find_by_xpath(headers_xpath)][0]
            records = [columns(row) for row in
                       self._browser.find_by_xpath(table_xpath)]
            # Extend the length of each row to the same length as the headings
            records = [row + ([None] * (len(headings) - len(row))) for row in records]
            # And add dummy columns if we do not have enough headings
            headings = headings + ["dummy{}".format(_) for _ in
                                   range(0, len(records[0]) - len(headings))]
            return pd.DataFrame.from_records(records, columns=headings)
        log.warning("Failed to find table {}".format(table_id))
        return None

    def member_training_record(self, member_number, member_name):
        self.lookup_member(member_number)
        # Select Training record
        self.wait_then_click_xpath('//*[@id="LBTN5"]')
        personal_learning_plans = self.fetch_table('tbl_p5_TrainModules')
        personal_learning_plans['member'] = member_number
        personal_learning_plans['name'] = member_name
        training_record = self.fetch_table('tbl_p5_AllTrainModules')
        training_record['member'] = member_number
        training_record['name'] = member_name
        mandatory_learning = self.fetch_table('tbl_p5_TrainOGL')
        mandatory_learning['member'] = member_number
        mandatory_learning['name'] = member_name
        return personal_learning_plans, training_record, mandatory_learning

    def member_permits(self, member_number, member_name):
        self.lookup_member(member_number)
        # Select Permits
        self.wait_then_click_xpath('//*[@id="LBTN4"]')
        permits = self.fetch_table('tbl_p4_permits')
        if permits is not None:
            permits['member'] = member_number
            permits['name'] = member_name
        return permits

    @lru_cache()
    def get_all_adult_trainers(self):
        self.adult_training()
        return self.fetch_table('tbl_p1_results')

    @lru_cache()
    def get_all_group_members(self):
        self.search()
        self._browser.is_element_present_by_xpath('//*[@id = "MemberSearch"]/tbody',
                                                  wait_time=10)
        time.sleep(1)
        # Hack to ensure that all of the search results have loaded.
        for i in range(0, 5):
            self._browser.execute_script(
                'document.getElementById("ctl00_main_working_panel_scrollarea").scrollTop = 100000')
            time.sleep(1)
        return self.fetch_table('MemberSearch')

    def export(self, section):
        # Select the My Scouting link.
        self._browser.is_text_present('My Scouting', wait_time=30)
        self._browser.click_link_by_text('My Scouting')
        # Click the "Group Sections" hotspot.
        self.wait_then_click_xpath('//*[@id="TR_HIER7"]/h2')
        # Click the link that shows the number of members in the section.
        # This is the one bit that is section specific. We might be able to
        # match on the Section name in the list, which would make it more
        # robust, but at present we just hard-code the location in the list.
        section_map = {
            'garrick': 2, 'paget': 3, 'swinfen': 4, 'brown': 4,
            'maclean': 5, 'rowallan': 6, 'somers': 7, 'boswell': 8,
            'erasmus': 9, 'johnson': 10}
        self.wait_then_click_xpath(
            '//*[@id="TR_HIER7_TBL"]/tbody/tr[{}]/td[4]/a'.format(
                section_map[section.lower()]))
        # Click on the Export button.
        self.wait_then_click_xpath('//*[@id="bnExport"]')
        # Click to say that we want a CSV output.
        self.wait_then_click_xpath(
            '//*[@id="tbl_hdv"]/div/table/tbody/tr[2]/td[2]/input')
        time.sleep(2)
        # Click to say that we want all fields.
        self.wait_then_click_xpath('//*[@id="bnOK"]')
        download_path = os.path.join(self._outdir, 'CompassExport.csv')
        if os.path.exists(download_path):
            log.warning("Removing stale download file.")
            os.remove(download_path)
        # Click the warning.
        self.wait_then_click_xpath('//*[@id="bnAlertOK"]')
        # The browser will now download the CSV file into outdir, where it
        # will be called CompassExport.csv. Wait for the file to appear.
        timeout = 30
        while not os.path.exists(download_path):
            time.sleep(1)
            timeout -= 1
            if timeout <= 0:
                log.warning("Timeout waiting for {} export to download.".format(section))
                break
        # Rename the download file.
        os.rename(download_path,
                  os.path.join(self._outdir, '{}.csv'.format(section)))
        log.info("Completed download for {}.".format(section))
        # Draw breath.
        time.sleep(1)

    def load_from_dir(self):
        # Load the records from the set of files in self._outdir.
        log.debug('Loading from {}'.format(self._outdir))

        def get_section(path, section):
            df = pd.read_csv(path, dtype=object, sep=',')
            df['section'] = section
            df['forenames_l'] = [_.lower().strip() for _ in df['forenames']]
            df['surname_l'] = [_.lower().strip() for _ in df['surname']]
            return df

        self._records = pd.DataFrame().append(
            [get_section(os.path.join(self._outdir, section),
                         os.path.splitext(section)[0])
             for section in os.listdir(self._outdir)],
            ignore_index=True)

    def find_by_name(self, firstname, lastname, section_wanted=None,
                     ignore_second_name=True):
        """Return list of matching records."""
        recs = self._records
        if ignore_second_name:
            df = recs[
                (recs.forenames_l.str.lower().str.match(
                    '^{}.*$'.format(firstname.strip(' ')[0].lower().strip()))) &
                (recs.surname_l == lastname.lower().strip())]
        else:
            df = recs[(recs.forenames_l == firstname.lower().strip()) &
                      (recs.surname_l == lastname.lower().strip())]
        if section_wanted is not None:
            df = df[(df['section'] == section_wanted)]
        return df

    def sections(self):
        "Return a list of the sections for which we have data."
        return self._records['section'].unique()

    def all_yp_members_dict(self):
        return {s: members for s, members in self._records.groupby('section')}

    def section_all_members(self, section):
        return [m for i, m in self._records[
            self._records['section'] == section].iterrows()]

    def section_yp_members_without_leaders(self, section):
        return [m for i, m in self._records[
            (self._records['section'] == section) &
            (self._records['role'].isin(
                ['Beaver Scout', 'Cub Scout', 'Scout']))].iterrows()]

    def members_with_multiple_membership_numbers(self):
        return [member for s, member in self._records.groupby(
            ['forenames', 'surname']).filter(
                lambda x: len(x['membership_number'].unique()) > 1).groupby(
                    ['forenames', 'surname', 'membership_number'])]
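# A hedged usage sketch for the Compass helper above. The credentials, output
# directory, and section name are placeholders, not real values.
if __name__ == "__main__":
    compass = Compass(username="user@example.com", password="secret",
                      outdir="/tmp/compass")
    compass.loggin()           # log in and pick the Group Admin role
    compass.export("garrick")  # downloads garrick.csv into outdir
    compass.quit()
    compass.load_from_dir()    # build the records DataFrame from the CSVs
    print(compass.sections())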
import time

from splinter import Browser


def download(link):
    browser = Browser()
    browser.visit('https://www.ssyoutube.com' + link)
    time.sleep(22)
    print("OPENED")
    browser.click_link_by_text('Download')
browser.find_by_name('name')
browser.find_by_text('Hello World!')
browser.find_by_id('firstheader')
browser.find_by_value('query')

# Get a single element from the match list
first_found = browser.find_by_name('name').first
last_found = browser.find_by_name('name').last
second_found = browser.find_by_name('name')[1]

# Get the value of an element
browser.find_by_css('h1').first.value

# Clicking links: each call clicks the first matching link
browser.click_link_by_href('http://www.the_site.com/my_link')
browser.click_link_by_partial_href('my_link')
browser.click_link_by_text('my link')
browser.click_link_by_partial_text('part of link text')
browser.click_link_by_id('link_id')

# Check whether an element is visible or invisible
browser.find_by_css('h1').first.visible

# Fill in content
browser.find_by_id('productName').fill(
    'splinter - python acceptance testing for web applications')
browser.fill('q', 'splinter - python acceptance testing for web applications')

# Verify whether an element has a given class name
browser.find_by_css('.content').first.has_class('content')

# Click a button
browser.find_by_name('send').first.click()
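# A minimal end-to-end session tying the calls above together; the URL and the
# field/selector names below are hypothetical placeholders, not a real site.
from splinter import Browser

with Browser('chrome', headless=True) as browser:
    browser.visit('http://example.com/search')
    browser.fill('q', 'splinter')
    browser.find_by_css('button[type=submit]').first.click()
    if browser.is_text_present('Results'):
        print(browser.find_by_css('h1').first.value)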
import time

from PIL import Image
from splinter import Browser

# CodeHandler and NeuralWork are project-local captcha-segmentation and
# neural-network modules.


class DownPatent(object):
    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        self.browser = Browser("phantomjs", wait_time=10)
        # self.browser = Browser()

    # Download a patent
    def download(self, patentno):
        # Visit the page (the page load may time out).
        # down_flag: 0 = not downloaded, 1 = patent does not exist, 2 = download failed
        download_link = ""
        down_flag = 0
        if True:
            print("opening page")
            self.browser.visit(self.down_url)
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):
                # Fill in the patent number
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print("filled in patent number")
                # Connection timed out, 404
                if self.browser:
                    print("opening captcha page")
                    # Try the captcha at most 20 times
                    code_handler = CodeHandler()
                    # Captcha texts filled in
                    list_fill_text = []
                    # Captcha image paths
                    list_code_path = []
                    # Captcha segmentation flags
                    list_split_flag = []
                    # Captcha recognition flags
                    list_reg_flag = []
                    for code_num in range(20):
                        print(code_num)
                        # Find the captcha
                        if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                            print("found captcha")
                            # Screenshot
                            # self.browser.driver.maximize_window()
                            self.browser.driver.save_screenshot("screenshot.png")
                            # Crop the captcha image out of the screenshot
                            image = Image.open("screenshot.png")
                            image_location = self.find_location(image)
                            image_code = image.crop(
                                (image_location[0], image_location[1],
                                 image_location[0] + 52, image_location[1] + 21))
                            save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                            save_path_temp = "../%s" % save_path
                            image_code.save(save_path_temp)
                            list_code_path.append(save_path)
                            # Split the image into single characters
                            list_split_image = self.deal_split(code_handler, image_code)
                            # If the captcha segments cleanly, recognise it;
                            # otherwise fetch a new captcha.
                            if len(list_split_image) == 4:
                                print("split succeeded")
                                list_split_flag.append(1)
                                reg_plain_text = self.reg_code(list_split_image)
                                fill_text = "".join(reg_plain_text)
                                list_fill_text.append(fill_text)
                                # Fill in the captcha
                                # hand_fill_text = input("Enter fill text:")
                                self.browser.fill("ValidCode", fill_text)
                                self.browser.find_by_value("确定").first.click()
                                print(self.browser.html.find("验证码输入错误"))
                                if self.browser.html.find("验证码输入错误") == -1:
                                    list_reg_flag.append(1)
                                    if self.browser.html.find("没有找到该专利") == -1:
                                        down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                        down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                        if down_link_one or down_link_two:
                                            print("found specification image download link")
                                            list_reg_flag.append(1)
                                            if down_link_one:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                            else:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")
                                            print("looking for download link")
                                            # Find the download link
                                            download_a = self.browser.find_link_by_text("下载专利")
                                            if download_a:
                                                download_link = download_a["href"]
                                                # Found the download link
                                                down_flag = 3
                                                break
                                            else:
                                                print("download failed")
                                                down_flag = 2
                                                break
                                        # else:
                                        #     print("recognised correctly, link not found")
                                        #     list_reg_flag.append(0)
                                        #     self.browser.back()
                                        #     self.browser.reload()
                                    else:
                                        print("patent does not exist")
                                        down_flag = 1
                                        break
                                else:
                                    print("recognition failed, reloading")
                                    list_reg_flag.append(0)
                                    self.browser.back()
                                    self.browser.reload()
                            else:
                                print("could not split captcha")
                                list_fill_text.append("")
                                list_split_flag.append(0)
                                list_reg_flag.append(0)
                                self.browser.reload()
                    # Store in the onlinecode collection: patent number, captcha
                    # path, recognised text, segmentation flag, recognition flag, time
                    for code_path, fill_text, split_flag, reg_flag in zip(
                            list_code_path, list_fill_text, list_split_flag, list_reg_flag):
                        try:
                            self.db.onlinecode.insert(
                                {"indexflag": patentno, "codepath": code_path,
                                 "filltext": fill_text, "splitflag": split_flag,
                                 "regflag": reg_flag, "time": time.ctime()})
                        except Exception:
                            pass
        return download_link

    # Split the captcha into characters
    def deal_split(self, code_handler, image):
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # Recognise the characters
    def reg_code(self, list_split_image):
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in range(each_split_image.size[1]):
                for y in range(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
        for each in out:
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # Locate the captcha image within the screenshot
    def find_location(self, image):
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        flag = image_width
        location = [0, 0]
        for y in range(image_width):
            for x in range(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break
        return location
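# A hedged usage sketch for DownPatent; the MongoDB database name, the query
# URL, and the patent number are illustrative placeholders.
if __name__ == "__main__":
    import pymongo

    db = pymongo.MongoClient().patents
    downloader = DownPatent(db, "http://www.example-patent-site.cn/query")
    print(downloader.download("201310000001"))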
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    mars_library = {}

    # NASA Mars news (this URL was left undefined in the original snippet)
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.find_all('div', class_='content_title')[0].find('a').text.strip()
    news_p = soup.find_all('div', class_='rollover_description_inner')[0].text.strip()
    mars_library['news_title'] = news_title
    mars_library['news_p'] = news_p

    url1 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    response1 = requests.get(url1)
    soup1 = BeautifulSoup(response1.text, 'html.parser')
    half_addy = soup1.find_all('a', class_='fancybox')[0].get('data-fancybox-href').strip()
    Big_Pic = "https://www.jpl.nasa.gov" + half_addy
    mars_library['featured_image_url'] = Big_Pic

    url2 = "https://twitter.com/marswxreport?lang=en"
    response2 = requests.get(url2)
    soup2 = BeautifulSoup(response2.text, 'html.parser')
    weather = soup2.find_all(
        'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')[0].text
    mars_library['mars_weather'] = weather

    url3 = 'https://space-facts.com/mars/'
    tables = pd.read_html(url3)
    df = tables[0]
    df.columns = ['Description', 'Values']
    mars_facts = df.to_html(justify='left')
    mars_library['mars_facts'] = mars_facts

    url4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url4)
    html = browser.html
    soup4 = BeautifulSoup(html, "html.parser")
    results = soup4.find_all('h3')
    hemisphere_image_urls = []
    tempdict = {}
    for result in results:
        item = result.text
        browser.click_link_by_partial_text(item)
        html1 = browser.html
        soup5 = BeautifulSoup(html1, "html.parser")
        image = soup5.find_all('div', class_="downloads")[0].find_all('a')[0].get("href")
        tempdict["title"] = item
        tempdict["img_url"] = image
        hemisphere_image_urls.append(tempdict)
        tempdict = {}
        browser.click_link_by_text('Back')
    mars_library['hemisphere_image_urls'] = hemisphere_image_urls

    return mars_library
# This fragment runs inside a loop over claim numbers; url, user_name,
# password, and claim_number are defined upstream.
browser.visit(url)
# Tell the browser to look for form data and fill it in with the credentials
# provided earlier
browser.fill('username', user_name)
browser.fill('password', password)
# Find the button and click it to submit
browser.find_by_tag('button').click()
# Find the navbar table where e-subro is housed; when we find it, click it
nav_bar = browser.find_by_id('td4')
drop_down = nav_bar.click()
# Search for the text 'Demand Search' and click it
browser.click_link_by_text('Demand Search')
# Fill in the form 'fileNo' with the claim number from earlier
browser.fill('fileNo', claim_number)
# Find the button element and submit
browser.find_by_name('btnSearch').first.click()
# Note this only works for TRS files; OLF files use different HTML to
# navigate. Use try/except to catch index errors and report to the console.
try:
    browser.click_link_by_partial_text('click here')
except Exception:
    print(f"This docket cannot be found, {claim_number}")
    continue
# Otherwise, for OLF cases, use this:
# browser.click_link_by_partial_text('Docket Records')
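# A sketch of the enclosing loop implied by the `continue` above; the
# claim_numbers list is hypothetical.
for claim_number in claim_numbers:
    browser.fill('fileNo', claim_number)
    browser.find_by_name('btnSearch').first.click()
    try:
        browser.click_link_by_partial_text('click here')
    except Exception:
        print(f"This docket cannot be found, {claim_number}")
        continue
    # ...scrape the docket page here, then return to the search form...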
import time

from splinter import Browser


class TwitterLiker():
    # constructor
    def __init__(self):
        self.mUrl = "https://www.twitter.com/"
        self.cycles = 2
        self.browser = Browser()
        self.username = "******"
        self.pw = 'XXXXXXXXXX\r'
        self.totalLikes = 0
        self.userNameField = 'session[username_or_email]'
        self.passwordField = 'session[password]'
        self.loginButtonId = 'submit btn primary-btn js-submit'

    # scroll the page and do the liking
    def launchPage(self):
        self.browser.visit(self.mUrl)
        self.login()
        # self.scrollBy()
        for i in range(0, self.cycles):
            self.likePosts()
        print(str(self.totalLikes) + " total likes this session...Yay!")

    def login(self):
        print("login")
        print("logging in as " + self.username)
        self.browser.click_link_by_text('Log in')
        # time.sleep(1)
        assert self.browser.find_by_name(self.userNameField)
        self.browser.fill(self.userNameField, self.username)
        self.browser.fill(self.passwordField, self.pw)
        inputs = self.browser.find_by_tag('input')
        for foo in inputs:
            if foo['class'] == self.loginButtonId:
                foo.click()
                print('clicked the log in button')
        # need to sleep a few seconds here
        time.sleep(3)

    def likePosts(self):
        print("liking posts")
        buttonList = self.browser.find_by_tag('button')
        time.sleep(2)
        buttonList = self.browser.find_by_tag('button')
        likeCount = 0
        time.sleep(1)
        for b in buttonList:
            if 'title="Like"' in b['innerHTML']:
                # check if it's visible; if not, move on
                if b.visible:
                    b.click()
                    self.totalLikes += 1
                    likeCount += 1
        print("just liked " + str(likeCount) + " tweets.")
        self.scrollBy()
        time.sleep(1)

    def scrollBy(self):
        print("scrolling down.")
        # print(self.browser.execute_script("window.scrollY"))
        self.browser.execute_script("window.scrollBy(0,30000);")
        time.sleep(2)

    def boneyard(self):
        print('boneyard')
import time

from bs4 import BeautifulSoup
from splinter import Browser

# `client` is a pymongo MongoClient created elsewhere in this module.


def economic_benefits():
    # initialize browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    # use the executable path below for mac
    # executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    print('COMMENCING DATA SCRAPE FOR ECONOMIC BENEFITS INFO')
    client.yosemite_db.economic_benefits.drop()

    # URL of the Yosemite news-releases page to be scraped
    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    time.sleep(2)

    # empty lists to hold raw scraped data
    article_links = []
    headlines = []
    article_contents = []
    # empty lists that will hold cleaned scraped data
    years = []
    amounts = []
    job_counts = []
    visitor_counts = []
    # empty list to hold final scraped data
    economic_benefits = []

    # go through pages 1-33 and find the links of the targeted articles
    for x in range(1, 34):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        article_snippets = soup.find_all('li', class_='ListingList-item')
        substring = 'Economic Benefit'
        for article_snippet in article_snippets:
            snippet_headline = article_snippet.find(
                'h3', class_='ListingResults-title').text
            if substring in snippet_headline:
                end_link = article_snippet.find('a')['href']
                article_link = 'https://www.nps.gov' + end_link
                article_links.append(article_link)
        browser.click_link_by_text('Next ')
        time.sleep(1)

    # visit each article link and extract the content
    for article_link in article_links:
        browser.visit(article_link)
        article_html = browser.html
        article_soup = BeautifulSoup(article_html, 'html.parser')
        headline = article_soup.find('div', class_='ContentHeader').text
        headline = headline.replace('\n', '')
        headlines.append(headline)
        article_content = article_soup.find('div', class_='ArticleTextGroup').text
        article_contents.append(article_content)

    # loop through headlines and extract the economic benefit $ amount (in millions)
    for headline in headlines:
        headline_split = headline.split('$')[1]
        amount = headline_split[:3]
        amounts.append(amount)

    # loop through article contents and extract year, job count, and visitor count
    for article_content in article_contents:
        year_split = article_content.split('Park in ')[1]
        year = year_split[:4]
        years.append(year)
        job_split = article_content.split('supported ')[1]
        job_count = job_split[:5]
        if ',' in job_count:
            job_count = job_count.replace(',', '')
            job_counts.append(job_count)
        elif ' ' in job_count:
            job_count = job_count.replace(' ', '')
            job_counts.append(job_count)
        else:
            job_counts.append(job_count)
        visitor_split = article_content.split('shows that')[1]
        visitor_count = visitor_split[:10]
        visitor_count = visitor_count.replace(',', '').replace('\xa0', '').replace(' ', '')
        visitor_counts.append(visitor_count)

    # append the extracted information into the economic_benefits dictionary
    economic_benefits.append({
        'years': years,
        'amounts': amounts,
        'job_counts': job_counts,
        'visitor_counts': visitor_counts
    })

    # append missing 2015 data
    economic_benefits[0]['years'].insert(2, '2015')
    economic_benefits[0]['amounts'].insert(2, '594')
    economic_benefits[0]['job_counts'].insert(2, '6890')
    economic_benefits[0]['visitor_counts'].insert(2, '4150217')

    economic_benefits_collection = client.yosemite_db.economic_benefits
    economic_benefits_collection.update({}, economic_benefits[0], upsert=True)
    print('OBTAINED ECONOMIC BENEFITS')
    print('-------------------------------------------------------')
    browser.quit()
    return economic_benefits
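# A quick sanity check of the headline-parsing logic used above, run on a
# made-up headline (illustrative only, not scraped data):
sample_headline = 'Tourism to Yosemite Contributes $589 Million to Local Economy'
assert sample_headline.split('$')[1][:3] == '589'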
import config
from splinter import Browser

browser = Browser()
browser.visit('https://studentemployment.neu.edu/tsx_studentjobs.aspx')
browser.fill('Skin$ctl08$LoginNameText', config.username)
browser.fill('Skin$ctl08$LoginPasswordText', config.password)
browser.find_by_name('Skin$ctl08$ctl14').click()
browser.click_link_by_text(config.jobTitle)
browser.find_link_by_text('Go to time sheet').first.click()
# browser.find_link_by_text('Start time sheet').first.click()
# alert = browser.get_alert()
# alert.accept()


def addShift(shift):
    browser.click_link_by_text('Add New Entry')
    browser.find_by_id('Skin_body_ctl01_WDL').find_by_css('option')[shift.day].click()
    browser.find_by_id('Skin_body_ctl01_StartDateTime1').select(shift.start)
    browser.find_by_id('Skin_body_ctl01_EndDateTime1').select(shift.end)
    browser.find_by_value('Add').first.click()


for shift in config.shifts:
    addShift(shift)
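# A sketch of the config module this script expects; all values here are
# hypothetical placeholders. `day` indexes the weekday <option> list, and
# `start`/`end` must match the time-sheet dropdown values.
from collections import namedtuple

Shift = namedtuple('Shift', ['day', 'start', 'end'])
username = 'jdoe'
password = 'secret'
jobTitle = 'Library Assistant'
shifts = [Shift(day=1, start='9:00 AM', end='12:00 PM')]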
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    # A web-scraping function for the latest news on Mars.
    # Python dictionary of the results
    scrape_rsult = {}

    # ### NASA Mars News
    # *** Scrape the NASA Mars News Site ***
    url_NASA = "https://mars.nasa.gov/news"
    r = requests.get(url_NASA)  # sends a request to the url
    time.sleep(1)
    data = r.text  # turns the response into text
    soup = BeautifulSoup(data, "html.parser")  # parses the response text as html

    # Collect the latest news title and paragraph text. Assign the text to
    # variables that you can reference later.
    soup_div = soup.find(class_="slide")  # within div in body, within <ul>, <li class=slide>
    soup_news = soup_div.find_all('a')  # search by anchor

    # Getting the title
    NASA_latest_t = soup_news[1].get_text().strip()
    # ^^^ latest news title
    scrape_rsult["Nasa_latest_title"] = NASA_latest_t

    # Getting the paragraph: first, the paragraph url
    soup_p = soup_div.find_all('a', href=True)
    soup_p_url = soup_p[0]['href']  # only the url of the latest news article's paragraph

    # Scrape the href of the first news article
    url = "https://mars.nasa.gov/"
    news_url = url + soup_p_url
    r = requests.get(news_url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    soup_para = soup.find(class_='wysiwyg_content')
    soup_para = soup_para.find_all('p')

    # Save the text of the paragraphs to a list
    NASA_latest_p = []
    for entry in soup_para:
        paragraph = entry.get_text().strip()
        NASA_latest_p.append(paragraph)
    # ^^^ NASA_latest_p is a list of paragraphs from the latest news article
    scrape_rsult["Nasa_latest_paragraph"] = NASA_latest_p

    # ### JPL Mars Space Images - Featured Image
    # Visit the url for JPL's Featured Space Image
    # (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)

    # Use splinter to navigate the site and find the image url for the current
    # featured Mars image. The featured images are under a list element of the
    # slide class; '>' signifies a child element.
    browser.find_by_css('li.slide>a.fancybox').first.click()
    time.sleep(1)
    # Click the 'more info' button (caution!: the 'share' button is under a
    # similar but different class)
    browser.find_by_css('div.buttons>a.button').first.click()
    time.sleep(1)

    # Assign the url string to a variable called `featured_image_url`.
    # Here, I decide to get both the full-size .jpg and an 800x600 size image
    # for the webpage.
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    # Full-size jpg (to be linked if the image is clicked)
    feat_full_img_soup = soup.find(class_="main_image")
    feat_full_img = feat_full_img_soup.get('src')
    # Smaller 800x600 jpg (to be displayed on the webpage); uses splinter
    # instead of Beautiful Soup.
    browser.click_link_by_partial_href('800x600.jpg')
    # Switch over to the window that just opened (window no. 2), save its url,
    # then close the 2nd window.
    browser.windows.current = browser.windows[1]
    featured_image_url = browser.url
    browser.windows[1].close()
    # Save the two urls
    ori_url = 'https://www.jpl.nasa.gov'
    feat_full_img = ori_url + feat_full_img
    # ^^^ feat_full_img is https://www.jpl.nasa.gov + the url of the full-sized
    # featured image; featured_image_url is the smaller 800x600 image that will
    # be featured on the webpage.
    scrape_rsult["featured_image_url"] = featured_image_url
    scrape_rsult['feat_full_img'] = feat_full_img

    # ### Mars Weather
    # *** Visit the Mars Weather twitter account
    # (https://twitter.com/marswxreport?lang=en) and scrape the latest Mars
    # weather tweet from the page. Save the tweet text for the weather report
    # as a variable called `mars_weather`. ***
    url = 'https://twitter.com/marswxreport?lang=en'
    r = requests.get(url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    mars_tweets = soup.find(class_='stream-items js-navigable-stream')
    mars_tweets = mars_tweets.find(class_="js-tweet-text-container")
    mars_weather = mars_tweets.p.text
    # ^^^ mars_weather is the paragraph <p> text of the latest tweet from the
    # Mars weather handle
    scrape_rsult["mars_weather_tweet"] = mars_weather

    # ### Mars Facts
    # *** Visit the Mars Facts webpage (http://space-facts.com/mars/) and use
    # Pandas to scrape the table containing facts about the planet including
    # Diameter, Mass, etc. ***
    facts_url = 'http://space-facts.com/mars/'
    all_facts_df = pd.read_html(facts_url)  # searches for html tables & returns a list of dataframes
    all_facts_df = all_facts_df[0]

    # Use Pandas to convert the data to an HTML table string.
    facts_html = all_facts_df.to_html(header=False, index=False, justify='left')
    # ^^^ facts_html is the html table of the mars facts table
    scrape_rsult["mars_facts_table"] = facts_html

    # ### Mars Hemispheres
    # *** Visit the USGS Astrogeology site to obtain high resolution images
    # for each of Mars's hemispheres. ***
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)

    # Click each of the links to the hemispheres to find the image url to the
    # full resolution image.
    # Old code, may be useful later:
    '''
    # get list of <a href links>
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup.find_all(class_='itemLink product-item')
    hemi_href_ls = []
    for item in hemi_soup:
        url_index = 'https://astrogeology.usgs.gov'
        href = item['href']
        link = url_index + href
        hemi_href_ls.append(link)
    '''
    # Get unique hrefs
    '''
    I could just go to these urls separately using browser.visit(url). But I
    interpret the instructions as saying that I need to use splinter to click
    on the link in the browser.
    '''
    # hemi_href_ls = np.unique(hemi_href_ls)

    # Caution!: it seems splinter can only click a link based on the exact
    # wording of the text, e.g.
    # browser.click_link_by_partial_text('Cerberus Hemisphere')
    # will fail to find lower-case 'cerberus'.

    # Beautiful Soup search of the browser html for headers (these contain the
    # hemisphere names)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    headers_soup = soup.find_all('h3')
    # test = headers_soup[2].text.replace(" Enhanced", "")

    # For each header in the soup, click the link associated with it and get
    # the img_url.
    hemisphere_image_urls = []
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    for header in headers_soup:
        # Start at the origin url for the Mars hemisphere section
        window = browser.windows[0]  # current window, the first window
        browser.visit(url)
        time.sleep(2)  # wait 2 secs for the browser to load
        # Getting the title
        title = header.text
        title = title.replace(" Enhanced", "")  # drop " Enhanced" for when the dict is appended
        browser.click_link_by_partial_text(title)
        time.sleep(2)  # again, wait 2 secs for the browser to load
        browser.click_link_by_text('Sample')
        browser.windows.current = browser.windows[1]  # switch to the window that just opened
        img_url = browser.url
        browser.windows.current = window  # switch the current window back
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})
        window.close_others()  # close all the other windows to keep the browser nice and tidy!
    # ^^^ hemisphere_image_urls is a list of dicts of the img_url and title of each hemisphere
    scrape_rsult["hemispheres"] = hemisphere_image_urls

    return scrape_rsult
from splinter import Browser


def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        print("better at least get here")
        # browser.click_link_by_href("#")
        for link in browser.find_link_by_href("#"):
            print("Okay")
            if link.visible:
                print(link.text)
                browser.click_link_by_text(link.text)
                break
        browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    browser.quit()
    return results
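# Example call for getRoutes above; the two addresses are illustrative
# placeholders.
if __name__ == "__main__":
    stations = getRoutes("Times Square, New York, NY", "Brooklyn Bridge, NY")
    print(stations)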