class LoginTestCase(unittest.TestCase):
    """End-to-end login/logout tests against a locally served app.

    Requires the app at http://127.0.0.1:8080/ and a local chromedriver.
    The GAE testbed provides clean datastore/memcache stubs per test.
    """

    BASE_URL = "http://127.0.0.1:8080/"

    def setUp(self):
        # Fresh service stubs for every test so state cannot leak between them.
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()
        self.testbed.init_memcache_stub()
        self.browser = Browser('chrome')

    def tearDown(self):
        self.testbed.deactivate()
        # Close the browser too; the original leaked a chromedriver per test.
        self.browser.quit()

    def _login(self):
        """Drive the login form and assert we reach the logged-in page.

        (This sequence was duplicated verbatim in both tests.)
        """
        self.browser.visit(self.BASE_URL)
        self.assertEqual(self.browser.find_by_tag("h3").first.text,
                         "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(
            self.browser.find_link_by_text("Insurance").first.text,
            "Insurance")

    def test_login(self):
        self._login()

    def test_logout(self):
        self._login()
        self.browser.find_link_by_text("Log out").first.click()
        # After logout we should be back on the anonymous landing page.
        self.assertEqual(self.browser.find_by_tag("h3").first.text,
                         "Not logged in")
def doScrape():
    """Scrape USGS hemisphere titles and their sample-image URLs.

    Returns:
        list of {"title": ..., "img_url": ...} dicts.
    """
    browser = Browser('chrome', executable_path='./chromedriver',
                      headless=False)

    # Collect every product title listed on the USGS results page.
    browser.visit(usgs_url)
    soup_usgs = BeautifulSoup(browser.html, 'html.parser')
    anchors = soup_usgs.find_all('a', class_='itemLink product-item')
    titles = [a.find('h3').text for a in anchors
              if a.find('h3') is not None]

    # Follow each title link, open the "Sample" image (opens a second
    # window), record its URL, then close that window again.
    img_links = []
    for name in titles:
        browser.visit(usgs_url)
        browser.click_link_by_partial_text(name)
        browser.find_link_by_text('Sample').click()
        img_links.append(browser.windows[1].url)
        browser.windows[1].close()

    hemisphere_image_urls = [
        {"title": t, "img_url": u} for t, u in zip(titles, img_links)
    ]
    print(hemisphere_image_urls)
    return hemisphere_image_urls
def enable():
    """Log in to Slack's web UI and re-enable the bot integration.

    Runs under a virtual X display (Xvfb) so Chrome can run headlessly on
    a server. Credentials and identifiers come from the local ``settings``
    module.
    """
    import time
    import settings
    from splinter import Browser
    from xvfbwrapper import Xvfb

    # BUG FIX: was a Python-2 print statement (`print "..."`), a syntax
    # error under the Python 3 the rest of this file targets.
    print("Trying to enable myself.")
    vdisplay = Xvfb()
    vdisplay.start()

    email = settings.getEmail()
    password = settings.getPassword()
    team_name = settings.getTeamName()
    bot_user = settings.getBotUser()

    browser = Browser('chrome')
    url = 'https://{}.slack.com/services/{}'.format(team_name, bot_user)
    browser.visit(url)
    browser.fill('email', email)
    browser.fill('password', password)
    browser.find_by_id('signin_btn').first.click()
    browser.find_link_by_text('Enable').first.click()

    # Sometimes there was no alert yet, so wait a bit before grabbing it.
    time.sleep(2)
    alert = browser.get_alert()
    alert.accept()

    # If the display is closed too quickly, the request isn't processed.
    time.sleep(2)
    vdisplay.stop()
def hemisphere():
    """Collect title and sample-image URL for each Mars hemisphere.

    Returns:
        {'Hemisphere Image URLS': [{'img_url': ..., 'title': ...}, ...]}
    """
    browser = Browser(
        "chrome",
        executable_path="/usr/local/bin/chromedriver",
        headless=False,
    )
    browser.visit("https://astrogeology.usgs.gov/search/results"
                  "?q=hemisphere+enhanced&k1=target&v1=Mars")

    hemi_img_urls = []
    # Re-query the link list on every pass: clicking navigates away and
    # would leave previously fetched element handles stale.
    total = len(browser.find_by_css("a.product-item h3"))
    for idx in range(total):
        browser.find_by_css("a.product-item h3")[idx].click()
        # The "Sample" anchor points at the full-size image.
        sample = browser.find_link_by_text("Sample").first
        entry = {
            "img_url": sample["href"],
            "title": browser.find_by_css("h2.title").text,
        }
        hemi_img_urls.append(entry)
        browser.back()

    return {'Hemisphere Image URLS': hemi_img_urls}
def hemisphere(browser):
    """Scrape title and full-size sample-image URL for each Mars hemisphere.

    Args:
        browser: an initialized splinter Browser. If None, a local chrome
            driver is created. (BUG FIX: the original unconditionally
            shadowed this parameter with a new Browser, so the caller's
            browser was always ignored.)

    Returns:
        list of {"img_url": ..., "title": ...} dicts, one per hemisphere.
    """
    if browser is None:
        executable_path = {"executable_path": "./chromedriver.exe"}
        browser = Browser("chrome", **executable_path)

    # Visit the USGS Astrogeology Science Center site.
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemisphere_image_urls = []
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        # Find element on each loop to avoid a stale element exception.
        browser.find_by_css("a.product-item h3")[item].click()
        # Find the sample image anchor tag and extract its href.
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()
    return hemisphere_image_urls
def submitQueue(NETID, PASSWORD, SECURITY_QUESTIONS):
    # Automates Princeton SCORE enrollment: logs in through the multi-step
    # Bharosa auth flow, navigates to the Student Center, and submits the
    # classes already sitting in the enrollment queue.
    #
    # NETID / PASSWORD: login credentials.
    # SECURITY_QUESTIONS: dict mapping a substring of each known security
    # question to its answer; the page HTML is searched for each key.
    browser = Browser()
    # netid page
    browser.visit("https://puaccess.princeton.edu/psp/hsprod/EMPLOYEE/HRMS/h/?tab=DEFAULT")
    browser.fill('userid', NETID)
    browser.find_by_value("Continue").first.click()
    # password page -- the Bharosa widget needs a simulated ENTER keypress
    browser.fill('Bharosa_Password_PadDataField', PASSWORD)
    browser.evaluate_script("Bharosa_Password_Pad.keyPress('ENTERKEY');")
    # security question page: answer whichever known question appears
    html = browser.html
    for key in SECURITY_QUESTIONS.keys():
        if key in html:
            answer = SECURITY_QUESTIONS[key]
            browser.fill('Bharosa_Challenge_PadDataField', answer)
            browser.evaluate_script("Bharosa_Challenge_Pad.keyPress('ENTERKEY');")
    time.sleep(2)
    # welcome to SCORE
    browser.find_link_by_text("Student Center").first.click()
    # student center, start by busting out of the iframe (deep PeopleSoft
    # portal URL loads the Student Center content pane directly)
    browser.visit("https://puaccess.princeton.edu/psc/hsprod/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL?PORTALPARAM_PTCNAV=HC_SSS_STUDENT_CENTER&EOPP.SCNode=HRMS&EOPP.SCPortal=EMPLOYEE&EOPP.SCName=ADMN_SCORE&EOPP.SCLabel=&EOPP.SCPTcname=ADMN_SC_SP_SCORE&FolderPath=PORTAL_ROOT_OBJECT.PORTAL_BASE_DATA.CO_NAVIGATION_COLLECTIONS.ADMN_SCORE.ADMN_S200801281459482840968047&IsFolder=false&PortalActualURL=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentURL=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentProvider=HRMS&PortalCRefLabel=Student%20Center&PortalRegistryName=EMPLOYEE&PortalServletURI=https%3a%2f%2fpuaccess.princeton.edu%2fpsp%2fhsprod%2f&PortalURI=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2f&PortalHostNode=HRMS&NoCrumbs=yes&PortalKeyStruct=yes")
    # "1005" selects the enrollment action in the academics dropdown --
    # value meaning comes from the PeopleSoft form; confirm if term changes
    browser.select('DERIVED_SSS_SCL_SSS_MORE_ACADEMICS', "1005")
    browser.find_by_id("DERIVED_SSS_SCL_SSS_GO_1").first.click()
    # pick semester (first radio option)
    browser.choose("SSR_DUMMY_RECV1$sels$0", "1")
    browser.find_by_id("DERIVED_SSS_SCT_SSR_PB_GO").first.click()
    # select classes to add... class should already be in queue
    browser.find_by_id("DERIVED_REGFRM1_LINK_ADD_ENRL$115$").first.click()
    # confirm classes
    browser.find_by_id("DERIVED_REGFRM1_SSR_PB_SUBMIT").first.click()
def make(b, c):
    # Drives the okzaijia admin site. b == 1: create a brand-new repair
    # order with randomized data; otherwise: look up order `c`, append a
    # repair record, then walk the task list and assign a work crew.
    # NOTE(review): credentials are hard-coded below -- presumably a test
    # account; confirm before reuse.
    browser = Browser('chrome')
    url = 'http://admin2.okzaijia.com.cn/Account/login'
    browser.visit(url)
    browser.find_by_id('UserName').fill('Tina')
    browser.find_by_id('Password').fill('13916099416')
    browser.find_by_id('LoginOn').click()
    browser.find_by_xpath('/html/body/div[1]/div[1]/div/div[2]/div/div/ul/li/a').click()
    if b == 1:
        browser.find_link_by_text(u'新增订单').click()  # "New order"
        # The form opens in a second window; switch to it.
        browser.windows.current = browser.windows[1]
        #print browser.windows.current
        textnew = browser.find_by_name('RepairContent')
        textnew.fill(random.randint(10000, 19999))
        a = ''.join([chr(random.randint(97, 122)) for _ in range(4)])
        browser.find_by_id('UserName').fill(a)
        browser.find_by_id('UserMobile').fill(random.randint(15138460867, 19000000000))
        browser.select('Source', random.randint(1, 10))
        browser.select('AreaId', random.randint(801, 819))
        browser.find_by_id('UserAddress').fill(random.randint(3000, 9999))
        browser.find_by_xpath('//*[@id="submit"]').click()
        time.sleep(2)
    else:
        # Search for the existing order by number and add a repair record.
        browser.find_by_name('orderno').fill(c)
        browser.find_by_xpath('//*[@id="searchForm"]/div[7]/button').click()
        browser.find_by_text(u'维修记录').click()  # "Repair records"
        browser.find_by_xpath("/html/body/div[1]/div[1]/div/div[2]/div[1]/a").click()
        browser.windows.current = browser.windows[1]
        b = ''.join([chr(random.randint(97, 122)) for _ in range(5)])
        browser.find_by_name('RepairContent').fill(b)
        browser.find_by_name('Remark').fill(random.randint(20000, 29999))
        browser.find_by_id('submit').click()
        time.sleep(3)
    # NOTE(review): the original was collapsed onto one line, so whether
    # this task-assignment tail belonged to the else-branch or to the
    # function body is ambiguous -- verify against the original source.
    browser.visit('http://admin2.okzaijia.com.cn/Task/MyTask?TaskType=4&Status=1')
    browser.windows.current = browser.windows[1]
    #print browser.windows.current
    browser.find_by_xpath('//*[@id="searchForm"]/div[3]/button').click()
    browser.find_by_xpath('//*[@id="pages"]/div/a[7]').click()
    browser.find_by_text(u'执行任务').last.click()  # "Execute task"
    time.sleep(2)
    browser.windows.current = browser.windows[2]
    browser.find_by_value('37').click()  # choose the crew that takes the order
    #print browser.find_by_value('17').text
    browser.find_by_id('submit').click()
def scrape():
    """Scrape Mars news, the JPL featured image, a facts table, and
    hemisphere images into a single results dict.

    Returns:
        dict with keys 'news_title', 'news_paragraph', 'featured_images',
        'facts' (HTML table string) and 'hemispheres' (list of dicts).
    """
    results = {}
    executable_path = {
        'executable_path': 'C:\p\HomeWork\Web-Scraping-Challenge\Mission_to_Mars\chromedriver.exe'
    }
    browser = Browser('chrome', **executable_path)

    # 1. Latest news headline + teaser from mars.nasa.gov
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # BUG FIX: selector was misspelled 'ul.itme_list', so this readiness
    # wait could never match the article list.
    browser.is_element_present_by_css('ul.item_list', wait_time=2)
    # Pass an explicit parser so bs4 doesn't warn / vary by environment.
    soup = BeautifulSoup(browser.html, 'html.parser')
    title = soup.find('div', 'content_title').get_text()
    news_p = soup.find('div', 'article_teaser_body').get_text()
    results['news_title'] = title
    results['news_paragraph'] = news_p

    # 2. Featured image from jpl.nasa.gov/spaceimages
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_btn = browser.find_by_id('full_image')
    full_image_btn.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_btn = browser.find_link_by_partial_text('more info')
    more_info_btn.click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    img_url_rel = soup.select_one('figure.lede a img').get('src')
    img_url = f'http://www.jpl.nasa.gov{img_url_rel}'
    results['featured_images'] = img_url

    # 3. Facts table from space-facts.com/mars
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    results['facts'] = df.to_html(classes='table table_striped')

    # 4. Hemisphere images from astrogeology.usgs.gov
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = []
    links = browser.find_by_css('a.product-item h3')
    for i in range(len(links)):
        hemi = {}
        # Re-find on each pass; clicking navigates away and stales handles.
        browser.find_by_css('a.product-item h3')[i].click()
        sample_elm = browser.find_link_by_text('Sample').first
        hemi['title'] = browser.find_by_css('h2.title').text
        hemi['img_url'] = sample_elm['href']
        hemispheres.append(hemi)
        browser.back()
    results['hemispheres'] = hemispheres
    return results
class Retreiver():
    # (sic: the misspelled class name is kept -- callers reference it.)
    """Downloads historical quote CSVs from Yahoo Finance for a set of
    tickers and merges them into one Excel workbook."""

    def __init__(self, folder):
        # Auxiliary supplies date/ticker/CSV parsing helpers (defined elsewhere).
        self.aux = Auxiliary()
        # Destination directory for the per-ticker CSV files.
        self.folder = folder
        # Filled in by retreive(); list of ticker symbols.
        self.tickers = None

    def click(self, destination):
        """Click the element whose text matches, falling back to a link
        lookup when no plain element is found."""
        try:
            self.browser.find_by_text(destination).first.click()
        except splinter.exceptions.ElementDoesNotExist:
            self.browser.click_link_by_text(destination)

    def retreive(self):
        """Interactively prompt for a date range and tickers, then download
        one CSV of historical data per ticker into self.folder."""
        print ('Please enter the period for retrieval.')
        raw_dates = input ('Dates in European format: dd/mm/yyyy\n>')
        eurodates = self.aux.date_parse(raw_dates)[0]
        dates = self.aux.european_dates_to_american(eurodates)
        raw_tickers = input ('Tickers:\n>')
        self.tickers = self.aux.parse_tickers(raw_tickers)
        self.browser = Browser('chrome')
        for ticker in self.tickers:
            self.browser.visit('https://beta.finance.yahoo.com/quote/%s/history' % ticker)
            # Let the page render before touching the date inputs.
            time.sleep(5)
            input_boxes = self.browser.find_by_tag('input')
            for i in range(0,6):
                input_boxes[i+2].fill(dates[i]) #we need 3-8 inputs
            self.click('Apply')
            download_link = self.browser.find_link_by_text('Download data').first
            # Fetch the CSV directly rather than driving the browser download.
            response = requests.get(download_link['href'])
            with open('%s//%s.csv' % (self.folder, ticker), 'wb') as f:
                f.write(response.content)
        self.browser.quit()

    def put_together(self):
        """Merge every CSV in self.folder into one workbook: the first
        sheet lists the tickers, then one sheet per CSV named after the
        file (extension stripped)."""
        if not self.tickers:
            # retreive() wasn't run this session; infer tickers from files.
            self.tickers = []
            for f in os.listdir(self.folder):
                self.tickers.append(f[:-4])
        target = openpyxl.Workbook()
        sheet = target.active
        sheet.append(self.tickers)
        for filename in os.listdir(self.folder):
            source = open('%s//%s' %(self.folder, filename), 'r', encoding='utf-8')
            sheet = target.create_sheet()
            sheet.title = filename[:-4] #strip out the extension
            for line in source:
                sheet.append(self.aux.parse_comma_separated_line(line))
            source.close()
        target.save('Historical_data_together.xlsx')
def hemispheres():
    """Return [{'title': ..., 'image_url': ...}] for each Mars hemisphere."""
    astropedia_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser('chrome')
    browser.visit(astropedia_url)

    hemisphere_urls = []
    # Count the product links once, but re-locate them on every pass --
    # clicking navigates away and leaves earlier handles stale.
    count = len(browser.find_by_css("a.product-item h3"))
    for idx in range(count):
        browser.find_by_css("a.product-item h3")[idx].click()
        sample_link = browser.find_link_by_text('Sample').first
        record = {
            'title': browser.find_by_css('h2.title').text,
            'image_url': sample_link['href'],
        }
        hemisphere_urls.append(record)
        browser.back()
    return hemisphere_urls
def mars_hemis(browser):
    """Scrape title + full-resolution image URL for the four Mars hemispheres.

    Args:
        browser: an initialized splinter Browser. If None, a local chrome
            driver is started. (BUG FIX: the original always shadowed this
            parameter with a new Browser, ignoring the caller's instance.)

    Returns:
        list of {'Title': ..., 'Image': ...} dicts.
    """
    if browser is None:
        executable_path = {'executable_path': 'chromedriver'}
        browser = Browser('chrome', **executable_path)

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    hemi_list = []
    for i in range(0, 4):
        # Return to the splash page each iteration; navigating away
        # invalidates the previously found element handles.
        browser.visit(url)
        hemi_titles_list = browser.find_by_css('.description .itemLink')
        hemi_title = hemi_titles_list[i].value
        # Pull the detail-page href and visit it directly.
        hemi_splinter_link = browser.find_by_css('div.description a.itemLink')
        browser.visit(hemi_splinter_link[i]["href"])
        # The "Sample" anchor links to the high-res image.
        sample_example = browser.find_link_by_text('Sample').first
        hemi_list.append({'Title': hemi_title, 'Image': sample_example["href"]})
    # BUG FIX: the original returned from inside the loop behind
    # `if i == 3`; return once after all four hemispheres are collected.
    return hemi_list
def retrieve_hemispheres():
    """Collect title, TIFF and JPEG sample URLs for every Mars hemisphere."""
    browser = Browser("chrome",headless=True)
    browser.visit(source_urls['hemispheres'])
    # Prime navigation: enter one product page and come straight back so
    # the hemisphere links are present on the listing page.
    browser.click_link_by_partial_text('Enhanced')
    browser.click_link_by_partial_text('Back')

    names = [anchor.text
             for anchor in browser.find_link_by_partial_text('Hemisphere')]

    hemisphere_image_urls = []
    for name in names:
        browser.click_link_by_partial_text(name)
        record = {
            'title': name[:-9],  # drop the trailing " Enhanced" suffix
            'tif_url': browser.find_link_by_partial_text('Original')['href'],
            'jpg_url': browser.find_link_by_text('Sample')['href'],
        }
        hemisphere_image_urls.append(record)
        browser.click_link_by_partial_text('Back')
    return hemisphere_image_urls
def scrap_hemisphereInfo():
    """Scrape each Mars hemisphere's title and full-size image URL.

    Returns:
        list of {'title': ..., 'image_url': ...} dicts.
    """
    from splinter import Browser
    from bs4 import BeautifulSoup

    # Collect the hemisphere detail-page links and names from the listing.
    browser = Browser('chrome', headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)
    soup = BeautifulSoup(browser.html, "html.parser")

    titles = []
    branch_links = []
    for title_name in soup.find_all("div", class_="description"):
        link = title_name.a['href']
        branch_links.append(f"https://astrogeology.usgs.gov/{link}")
        name = title_name.h3.text
        titles.append(name.replace(" Enhanced", "").strip())

    hemisphere_image_urls = []
    for i in range(len(branch_links)):
        # Reuse the single browser instead of opening (and leaking) a new
        # chromedriver per detail page, as the original did.
        browser.visit(branch_links[i])
        full_image_link = browser.find_link_by_text("Sample")
        # BUG FIX: the original reused one dict across iterations, so every
        # appended entry aliased the same (last) hemisphere. Build a fresh
        # dict each pass.
        hemisphere_image_urls.append({
            'title': titles[i],
            'image_url': full_image_link['href'],
        })
    return hemisphere_image_urls
def hemisphere():
    """Return [{'img_url': ..., 'title': ...}] for the four Mars hemispheres."""
    chromedriver = r"C:\Users\Mickey\anaconda3\Scripts\chromedriver.exe"
    browser = Browser("chrome", executable_path=chromedriver, headless=False)

    # Landing page listing all hemisphere products.
    browser.visit("https://astrogeology.usgs.gov/search/results"
                  "?q=hemisphere+enhanced&k1=target&v1=Mars")

    hemi_img_urls = []
    n_products = len(browser.find_by_css("a.product-item h3"))
    for idx in range(n_products):
        # Links go stale after navigation, so look them up fresh each time.
        browser.find_by_css("a.product-item h3")[idx].click()
        time.sleep(1)
        sample = browser.find_link_by_text("Sample").first
        hemi_img_urls.append({
            "img_url": sample["href"],
            "title": browser.find_by_css("h2.title").text,
        })
        browser.back()

    # Close the browser when done.
    browser.quit()
    return hemi_img_urls
def hemisphere_image(browser):
    """Scrape image URL and title for each Mars hemisphere.

    Args:
        browser: an initialized splinter Browser. If None, one is created.
            (BUG FIX: the original shadowed this parameter with a new
            Browser, ignoring the caller's instance.)

    Returns:
        list of {"img_url": ..., "title": ...} dicts. (BUG FIX: the
        original returned `hemisphere_image` -- the function object
        itself -- instead of the collected list.)
    """
    if browser is None:
        browser = Browser('chrome', headless=False)
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemisphere_image_urls = []
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        # Re-find each pass: clicking navigates away and stales the handles.
        browser.find_by_css("a.product-item h3")[item].click()
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()
    return hemisphere_image_urls
# In[38]: length_loop=len(title_array) hemisphere_image_urls=[] for marker in range(length_loop): temp_dict={'title': title_array[marker], 'img_url':list_image_urls[marker]} hemisphere_image_urls.append(temp_dict) # In[39]: hemisphere_image_urls # In[40]: links_found = browser.find_link_by_text('Link for Example.com') return Mars_data
# Scrape title and full-resolution image for each of the four Mars
# hemispheres. Indices 1, 3, 5, 7 of the 'product-item' anchors are the
# ones carrying the hemisphere text on this page layout.
hemi_dicts = []
for i in range(1, 9, 2):
    hemi_dict = {}
    browser.visit(mars_hemisphere_url)
    time.sleep(1)
    hemispheres_html = browser.html
    hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')
    hemi_name_links = hemispheres_soup.find_all('a', class_='product-item')
    # NOTE(review): str.strip('Enhanced') strips those *characters* from
    # both ends, not the word -- it works here by luck; verify titles.
    hemi_name = hemi_name_links[i].text.strip('Enhanced')
    detail_links = browser.find_by_css('a.product-item')
    detail_links[i].click()
    time.sleep(1)
    # The Sample link opens the image in a new window; grab its HTML,
    # then switch back and close the extra window.
    browser.find_link_by_text('Sample').first.click()
    time.sleep(1)
    browser.windows.current = browser.windows[-1]
    hemi_img_html = browser.html
    browser.windows.current = browser.windows[0]
    browser.windows[-1].close()
    hemi_img_soup = BeautifulSoup(hemi_img_html, 'html.parser')
    hemi_img_path = hemi_img_soup.find('img')['src']
    print(hemi_name)
    hemi_dict['title'] = hemi_name.strip()
    print(hemi_img_path)
    hemi_dict['img_url'] = hemi_img_path
    # BUG FIX: the original never appended, so hemi_dicts stayed empty.
    hemi_dicts.append(hemi_dict)
def scrape_all():
    """Scrape Mars news, featured image, facts table, and hemispheres.

    Returns:
        dict with keys 'news_title', 'news_p', 'featured_image_url',
        'facts' (HTML table string) and 'hemisphere' (list of dicts).
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    dic = {}

    # --- NASA Mars news: latest title and teaser paragraph ---
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    title = soup.find_all('div', class_='content_title')
    body = soup.find('div', class_='article_teaser_body')
    news_title = title[1].text
    news_p = body.text
    # BUG FIX: the original did dic[news_title] = news_title (and the same
    # for news_p / featured_image_url), keying the dict by the scraped
    # text itself instead of a stable field name.
    dic['news_title'] = news_title
    dic['news_p'] = news_p

    # --- JPL featured image ---
    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)
    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    imgs = soup.find('figure', class_='lede')
    featured_image_url = 'https://www.jpl.nasa.gov' + imgs.a.img['src']
    dic['featured_image_url'] = featured_image_url

    # --- Mars facts table ---
    facts = pd.read_html('https://space-facts.com/mars/')
    df = facts[0]
    df.columns = ['Profile', 'Values']
    df.set_index('Profile', inplace=True)
    # BUG FIX: the original discarded the .replace() result and never
    # stored the table in the output dict at all.
    html_facts = df.to_html().replace('\n', '')
    dic['facts'] = html_facts
    df.to_html('facts.html')

    # --- Mars hemispheres (reuse the one browser; the original opened a
    # second chromedriver and leaked the first) ---
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    image_urls = []
    imgs = browser.find_by_css("a.product-item h3")
    for i in range(len(imgs)):
        hemisphere = {}
        # Re-find each pass; clicking navigates away and stales handles.
        browser.find_by_css("a.product-item h3")[i].click()
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        image_urls.append(hemisphere)
        browser.back()
    dic['hemisphere'] = image_urls

    browser.quit()
    return dic
def scrape():
    """Scrape all Mars data sources into one dictionary.

    Returns:
        dict with keys 'news_title', 'news_p', 'featured_image_url',
        'mars_html_table', 'hemisphere_image_urls'.
    """
    ##### NASA Mars News #####
    # This section is pure requests/bs4 -- no browser needed. (The
    # original opened an init_browser() instance here and before every
    # section, immediately reassigned or abandoned it, leaking drivers.)
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.find_all('div', class_='content_title')[0].text
    news_p = soup.find_all('div', class_='rollover_description_inner')[0].text

    ##### JPL Mars Space Images - Featured Image #####
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)
    browser.links.find_by_partial_text('FULL IMAGE').click()
    # Delay so the lightbox can open before scraping.
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    mars_image = soup.find('img', class_='fancybox-image')
    featured_image_url = ('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'
                          + mars_image['src'])
    browser.quit()

    ##### Mars Facts #####
    url = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url)
    mars_data_df = mars_table[0]
    # BUG FIX: 'table table-striped' 'table-bordered' concatenated to
    # "table table-stripedtable-bordered"; add the missing space.
    mars_html_table = mars_data_df.to_html(
        classes='table table-striped table-bordered',
        index=False, header=False, border=1)

    ##### Mars Hemispheres #####
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_image_urls = []
    for i in range(4):
        hemisphere = {}
        time.sleep(1)
        # Click on each hemisphere's enhanced link.
        browser.find_by_css("a.product-item h3")[i].click()
        hemisphere["title"] = browser.find_by_css("h2.title").text
        # The Sample link carries the full-size jpg URL.
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        hemisphere_image_urls.append(hemisphere)
        # Back to the products page for the next iteration.
        browser.back()
    # Single quit (the original called browser.quit() twice at the end).
    browser.quit()

    return {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_html_table": mars_html_table,
        "hemisphere_image_urls": hemisphere_image_urls,
    }
class UploadTestCase(unittest.TestCase):
    """File-upload flow tests against a locally served app.

    Requires the app at http://127.0.0.1:8080/ and a local chromedriver;
    the GAE testbed provides clean datastore/memcache stubs per test.
    """

    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()
        self.testbed.init_memcache_stub()
        self.browser = Browser('chrome')

    def tearDown(self):
        self.testbed.deactivate()

    def _login(self):
        """Log in and verify the logged-in landing page.

        (This sequence was duplicated verbatim in both tests.)
        """
        self.browser.visit("http://127.0.0.1:8080/")
        self.assertEqual(self.browser.find_by_tag("h3").first.text,
                         "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(
            self.browser.find_link_by_text("Insurance").first.text,
            "Insurance")

    def _start_new_task(self):
        """Open the new-task form and fill the basic fields."""
        self.browser.visit("http://127.0.0.1:8080/tasks")
        self.browser.click_link_by_text('Create new task')
        self.browser.fill('title', 'title')
        self.browser.fill('text', 'text')

    def _fixture_path(self):
        # Test image shipped next to this test module.
        return os.path.join(os.path.dirname(__file__), '1.png')

    def test_when_create_task_upload_file(self):
        self._login()
        self._start_new_task()
        self.browser.is_element_present_by_name('files[]', wait_time=10)
        self.browser.attach_file('files[]', self._fixture_path())
        self.browser.find_by_css('.btn.btn-primary.start').first.click()
        # One uploaded row with its four cells should appear.
        self.assertEqual(
            1, len(self.browser.find_by_css('.template-download.fade.in')))
        self.assertEqual(
            4, len(self.browser.find_by_css('.template-download.fade.in td')))

    def test_when_create_task_upload_many_files(self):
        self._login()
        self._start_new_task()
        self.browser.is_element_present_by_name('files[]')
        for _ in range(3):
            self.browser.attach_file('files[]', self._fixture_path())
        self.browser.find_by_css('.btn.btn-primary.start').first.click()
        # Give the three uploads time to complete before counting rows.
        sleep(3)
        self.assertEqual(
            3, len(self.browser.find_by_css('.files tr.template-download')))
button.click() # Click OK break # --Download page reached-- format = browser.find_by_id('downloadFormatMenu0').first # Open download format chooser format.click() # Switch to your desired download format for a in browser.find_by_tag('a'): if dl_format+' -' in a.value: a.click() print('Switching to '+dl_format+' format.') break # Print format being used format = browser.find_by_id('downloadFormatMenu0').first print('Format: '+format.value) # Wait while the download is being prepared... print('Preparing download.') while browser.is_text_present('preparing'): time.sleep(5) # Grab final download link downloadLink = browser.find_link_by_text('Download').first print('Got download link! Starting download...') url = downloadLink['href'] file_name = wget.download(url) # Download the link using wget # Repeat for other albums in the list
def scrape():
    """Scrape Mars news, featured image, facts table, and hemispheres.

    Returns:
        dict with keys 'news_title', 'news_paragraph',
        'featured_image_url', 'facts', 'hemispheres'. (BUG FIX: the
        original collected everything but returned None.)
    """
    browser = Browser('chrome')

    ### NASA Mars News
    Nasa_news_url = 'https://mars.nasa.gov/news/'
    browser.visit(Nasa_news_url)
    soup_nasa = BeautifulSoup(browser.html, 'html.parser')
    news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
    news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text
    print(news_titles)
    print('------------------')
    print(news_paragraphs)

    ### JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)
    # Open the full image, then its detail page.
    image = browser.find_by_id('full_image')
    image.click()
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    soup = BeautifulSoup(browser.html, 'html.parser')
    url_image_find = soup.find('img', class_='main_image').get("src")
    featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find

    ### Mars Facts
    mars_facts_df = pd.read_html('https://space-facts.com/mars/')[2]
    mars_facts_df.columns = ["Details", "Measures"]
    mars_facts_html = mars_facts_df.to_html()

    ### Mars Hemispheres
    # BUG FIX: the URL had a stray trailing ')' baked into the string, and
    # the page was parsed *before* browser.visit(url) ran.
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    web_links = browser.find_by_css("a.product-item h3")
    web_list = []
    for i in range(len(web_links)):
        web_hemispheres = {}
        # Re-find each pass; clicking navigates away and stales handles.
        browser.find_by_css("a.product-item h3")[i].click()
        web_hemispheres["link"] = browser.find_link_by_text('Sample').first["href"]
        web_hemispheres["Title"] = browser.find_by_css('h2.title').text
        web_list.append(web_hemispheres)
        browser.back()
    browser.quit()

    return {
        'news_title': news_titles,
        'news_paragraph': news_paragraphs,
        'featured_image_url': featured_image_url,
        'facts': mars_facts_html,
        'hemispheres': web_list,
    }
class TestViews(unittest.TestCase):
    """Browser-level tests (splinter + PhantomJS) for the login, logout and
    entry views of a Flask app served at http://127.0.0.1:8080.

    NOTE(review): relies on module-level objects defined elsewhere in this
    file (``Base``, ``engine``, ``session``, ``User``, ``app``,
    ``generate_password_hash``) — confirm against the imports at the top.
    """

    def setUp(self):
        """Test setup: start a browser, create the schema, seed one user,
        and launch the app in a background process."""
        #define browser instance
        self.browser = Browser("phantomjs")
        #Set up the tables in the database
        Base.metadata.create_all(engine)
        #Create an example user (password is stored hashed)
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        # Run the app in a separate process so the browser can reach it.
        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)  # give the server a moment to start listening

    def test_login_correct(self):
        """Logging in with the seeded credentials redirects to the home page."""
        #navigate to demo website
        self.browser.visit("http://127.0.0.1:8080/login")
        #enter user name and password in their fields
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        #define Log in button
        button = self.browser.find_by_css("button[type=submit]")
        #click on the Log in button
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_login_incorrect(self):
        """A wrong email keeps the user on the login page."""
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_logout(self):
        """Logging out after a successful login returns to the login page."""
        #navigate to demo log in website
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        #confirm return to home page
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        #navigate to demo log out website
        self.browser.visit("http://127.0.0.1:8080/logout")
        #confirm log out link exists
        logout_link = self.browser.find_link_by_text("Log out")
        #confirm return to log in page
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_add_entry_edit(self):
        """Smoke-test the add/edit entry forms after logging in.

        NOTE(review): this test only locates form fields; it asserts
        nothing after the initial login redirect (see inline comments).
        """
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        # check add entry link exists
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        first_found = self.browser.find_by_name("title").first
        last_found = self.browser.find_by_name("content").last
        button = self.browser.find_by_css("button[type=submit]")
        self.browser.visit("http://127.0.0.1:8080/entry/edit")
        self.browser.find_by_name("title")
        self.browser.find_by_name("content")
        # self.browser.find_by_value("entry_title").first why is splinter not recognising flask format in html
        #self.browser.find_by_value("entry_content").last
        button = self.browser.find_by_css("button[type=submit]")
        #self.assertEqual(self.browser.url, "http://127.0.0.1:8080/") this gives error
        # all tests are running ok but I noticed that get/entry/edit gave a 404. Why?
        #test entry author is th person editing and is logged in

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
# In[87]: #Python code with loop for i in range (number): hemisphere = {} i = i + 1 print(i) try: browser.find_by_css('a.product-item')[i].click() except: continue hemi_href = browser.find_link_by_text('Sample').first hemisphere['img_url'] = hemi_href['href'] hemisphere['title'] = browser.find_by_css('h2.title').text hemisphere_images.append(hemisphere) print(i) browser.back() # In[88]: #flat url hemisphere_images
# Visit the hemispheres search page (url is defined in an earlier cell).
browser.visit(url)

# In[17]:
# Collect one {img_url, title} record per hemisphere (four in total).
hemisphere_image_urls = []

for idx in range(4):
    record = {}
    # Query the links fresh on every pass — clicking navigates away and
    # would otherwise leave us holding stale element handles.
    browser.find_by_css("a.product-item h3")[idx].click()
    # The 'Sample' anchor carries the full-resolution image URL.
    sample_link = browser.find_link_by_text("Sample").first
    record["img_url"] = sample_link["href"]
    record["title"] = browser.find_by_css("h2.title").text
    hemisphere_image_urls.append(record)
    browser.back()

# In[18]:
# Notebook echo of the collected records.
hemisphere_image_urls
# Scenario-creation fragment; part of a larger loop over counterparty types.
# NOTE(review): `typeaNameList`, `cType`, `conIndex`, `countryTypeList`,
# `igTypeListScotia`, `companyName` and `browser` all come from enclosing
# code outside this chunk — confirm before running in isolation.
typeaName = typeaNameList[cType]
if cType == 0 or cType == 1:
    # pass
    # Only Scotia counterparty types (0 and 1) get one scenario per IG type.
    for igType in igTypeListScotia:
        # fdsf
        browser.find_by_name('addScenario').first.click()
        # Scenario name is country type + counterparty type + IG type.
        browser.fill('scName', countryTypeList[conIndex]+typeaName+igType)
        # Clear the effective-date field with backspaces, then retype it.
        browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
        browser.type('scEffDate', '2015-10-31')
        browser.find_by_name('update').first.click()
        browser.find_link_by_text('Obligor').first.click()
        # choose the companyType type
        element = browser.find_by_name('companyType').first
        element.select(str(cType))
        browser.fill('obligorName', companyName)
        browser.find_by_name('ObligorSearch').first.click()
        # Pick the first search result from the javascript results list.
        browser.find_link_by_partial_href('javascript:refPortResult')[0].click()
        # select "B-III counterpaty type" to be "corporate"
        element = browser.find_by_name('counterPartyType').first
        element.select('1')
        # select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
        element = browser.find_by_name('avc').first
def scrape_all():
    """Scrape Mars news, the JPL featured image, the weather tweet, the
    facts table and the hemisphere images into one dictionary.

    Returns:
        dict with keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``hemispheres``, ``weather``, ``facts`` and
        ``last_modified``. News/weather values are ``None`` when the
        source markup cannot be found (the pages change frequently).
    """
    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path)

    # --- NASA Mars news ---------------------------------------------------
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    news_soup = BeautifulSoup(browser.html, 'html.parser')
    slide_elem = news_soup.select_one('ul.item_list li.slide')
    # ROBUSTNESS FIX: the original raised AttributeError when the slide (or
    # its children) were missing; fall back to None instead of crashing.
    news_title = None
    news_p = None
    if slide_elem:
        title_div = slide_elem.find("div", class_='content_title')
        body_div = slide_elem.find('div', class_="article_teaser_body")
        news_title = title_div.get_text() if title_div else None
        news_p = body_div.get_text() if body_div else None

    # --- JPL Space Images featured image ----------------------------------
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    # Find and click the full image button
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()
    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    img_soup = BeautifulSoup(browser.html, 'html.parser')
    # Relative image url joined onto the site root
    img_url_rel = img_soup.select_one('figure.lede a img').get("src")
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'

    # --- Mars weather (Twitter) -------------------------------------------
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    weather_soup = BeautifulSoup(browser.html, 'html.parser')
    # Find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div', attrs={
        "class": "tweet",
        "data-name": "Mars Weather"
    })
    # ROBUSTNESS FIX: the tweet container is frequently absent (layout
    # changes, rate limiting); the original crashed with AttributeError.
    mars_weather = None
    if mars_weather_tweet:
        tweet_text = mars_weather_tweet.find('p', 'tweet-text')
        if tweet_text:
            mars_weather = tweet_text.get_text()

    # --- Hemispheres ------------------------------------------------------
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_image_urls = []
    links = browser.find_by_css("a.product-item h3")
    for i in range(len(links)):
        hemisphere = {}
        # Re-find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        hemisphere['title'] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()

    # --- Mars facts -------------------------------------------------------
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    df = df.to_html()

    data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": img_url,
        "hemispheres": hemisphere_image_urls,
        "weather": mars_weather,
        "facts": df,
        "last_modified": dt.datetime.now()
    }
    browser.quit()
    return data
def scrape():
    """Scrape Mars news, the JPL featured image, the latest weather tweet,
    the facts table and the hemisphere images.

    Returns:
        dict with keys ``news`` ({title, body, link, date}),
        ``feature_img``, ``weather``, ``facts`` (list of rows) and
        ``hemi_img`` (list of {title, img_url}).
    """
    # Import dependencies -------------------------------------------------
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import requests
    import time
    import pandas as pd

    # set up Splinter ------------------------------------------------------
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. NASA Mars News ----------------------------------------------------
    # Splinter (not requests) is required here: the news list is rendered
    # by JavaScript after page load.
    url_news = "https://mars.nasa.gov/news"
    browser.visit(url_news)
    html = browser.html
    bsoup = bs(html, 'html.parser')
    # The first <li class="slide"> holds the latest story.
    li = bsoup.find("li", class_="slide")
    news_t = li.find("div", class_="content_title").text  # title
    news_p = li.find("div", class_="article_teaser_body").text  # paragraph
    # Link to the story, rebuilt on the site base url.
    news_link = url_news.replace("/news", "") + li.find(
        "div", class_="content_title").a["href"]
    news_date = li.find("div", class_="list_date").text  # date

    # 2. JPL Mars Space Images - Featured Image ----------------------------
    url_img = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Clicking 'more info' directly sometimes fails with "element not
    # visible"; grabbing its href and visiting it avoids the wait.
    href = browser.find_link_by_partial_text("more info")[0]["href"]
    browser.visit(href)
    browser.find_by_css(".main_image").click()
    # After clicking, the browser URL is the full-size image itself.
    featured_image_url = browser.url

    # 3. Mars Weather ------------------------------------------------------
    url_twitter = "https://twitter.com/marswxreport?lang=en"
    html = requests.get(url_twitter).text
    bsoup = bs(html, "html.parser")
    # All tweets live under the stream <ol>.
    ol = bsoup.find(id="stream-items-id")
    lis = ol.findAll("li")
    # First tweet mentioning "hPa" (pressure) is the weather report.
    mars_weather = ""
    for li in lis:
        tweet = li.find("div", class_="js-tweet-text-container").p.text
        # BUG FIX: the original tested `tweet.find("hPa")`, but str.find
        # returns -1 (truthy!) when absent and 0 (falsy!) at position 0,
        # so the wrong tweet was selected. Use a membership test instead.
        if "hPa" in tweet:
            mars_weather = tweet
            break

    # 4. Mars Facts --------------------------------------------------------
    url_fact = "https://space-facts.com/mars/"
    # use pandas to scrape tabular data from the page
    tables = pd.read_html(url_fact)
    facts = tables[0]
    # store data in a list of lists
    facts = facts.values.tolist()

    # 5. Mars Hemispheres --------------------------------------------------
    url_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemi)
    html = browser.html
    bsoup = bs(html, "html.parser")
    items = bsoup.findAll("div", class_="item")
    hemisphere_image_urls = []
    for item in items:
        title = item.find("h3").text  # title
        url = "https://astrogeology.usgs.gov/" + item.find(
            "div", class_="description").a["href"]  # picture-detail page
        browser.visit(url)
        img_url = browser.find_link_by_text("Sample")[0][
            "href"]  # url of the full-size picture
        hemisphere_image_urls.append({
            "title": title,
            "img_url": img_url
        })

    # Bundle everything scraped into a single dictionary -------------------
    data = {
        "news": {
            "title": news_t,
            "body": news_p,
            "link": news_link,
            "date": news_date
        },
        "feature_img": featured_image_url,
        "weather": mars_weather,
        "facts": facts,
        "hemi_img": hemisphere_image_urls
    }
    print(data)  # print to console
    return data
def scrape():
    """Scrape NASA news, the JPL featured image, the weather tweet, the
    facts table and the hemisphere images.

    Returns:
        dict with keys ``Nasa_Title``, ``Nasa_Paragraph``,
        ``Feature_Image``, ``Mars_Weather``, ``Mars_Facts`` (DataFrame)
        and ``Hemispheres`` (list of {title, image_url}).
    """
    # Scrape NASA Mars News Site and collect news title and paragraph text
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    news_title = soup.find('div', class_='content_title').text
    paragraph_text = soup.find('div', class_='rollover_description_inner').text

    # Visit the URL for JPL's Space Images - use splinter to navigate site
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    # Click through 'FULL IMAGE' then 'more info' to reach the full image.
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    soup_2 = bs(browser.html, 'html.parser')
    img_url = soup_2.find('img', class_='main_image')
    end_img_url = img_url.get('src')
    feature_image_url = 'https://www.jpl.nasa.gov' + end_img_url

    # Scrape latest Mars weather tweet
    url = 'https://twitter.com/marswxreport?lang=en'
    twitter_resp = requests.get(url)
    twitter_soup = (bs(twitter_resp.text, 'html.parser').find(
        'div', class_='js-tweet-text-container')).text.strip()

    # Mars facts table scraped with pandas
    mars_facts_request = requests.get('https://space-facts.com/mars/')
    mars_facts_table = pd.read_html(mars_facts_request.text)
    mars_facts_df = mars_facts_table[0]

    # Visit USGS Astrogeology site to obtain high resolution images of
    # each of Mars' hemispheres
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    links = browser.find_by_css("a.product-item h3")
    hemisphere_img_urls = []
    for link in range(len(links)):
        # Re-find on each pass — clicking navigates away from the list.
        browser.find_by_css("a.product-item h3")[link].click()
        # BUG FIX: find_link_by_text returns an ElementList; the original
        # indexed it with 'href' and appended the element list itself to
        # the results. Take .first and store the href string.
        image_href = browser.find_link_by_text('Sample').first['href']
        image_title = browser.find_by_css('h2.title').text
        hemisphere_img_urls.append({
            "title": image_title,
            "image_url": image_href
        })
        browser.back()

    mars_dictionary = {
        "Nasa_Title": news_title,
        "Nasa_Paragraph": paragraph_text,
        "Feature_Image": feature_image_url,
        "Mars_Weather": twitter_soup,
        "Mars_Facts": mars_facts_df,
        "Hemispheres": hemisphere_img_urls
    }
    browser.quit()
    return mars_dictionary
# Test-data name lists for the scenario-creation loop below.
# NOTE(review): `countryList` is defined elsewhere in this file.
typeaNameList = ['scotia_pub_', 'scotia_priv_', 'nonsco_pub_', 'nonsco_priv_']
typeaNameList2 = ['scotia_pub', 'scotia_priv', 'nonsco_pub', 'nonsco_priv']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION','BASCO DESEMBUAGE','A&T CORPORATION','asdf']
devgCompanyNameList = [countryList[1],countryList[1],countryList[1],'asdf']
# open browser, navigate to the right page, configure,
from splinter import Browser
from splinter import driver
browser = Browser()
# Sign in to the ClientNet UAT admin console.
# SECURITY NOTE(review): credentials are hard-coded in source; they should
# come from configuration or the environment.
browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCOMM%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()
browser.find_link_by_text('International Banking').first.click()
# Open the test client record used for the RF vs RFDA scenarios.
browser.fill('clientSearchString', 'RF vs RFDA test cases')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()
# Iterate country development status x counterparty type; the loop body
# continues beyond this chunk.
for conIndex in range(2):
    for cType in range(4):
        # Developed countries use the real company names, developing ones
        # use the country-name placeholders.
        if conIndex == 0:
            companyName = devdCompanyNameList[cType]
        else:
            companyName = devgCompanyNameList[cType]
        typeaName = typeaNameList[cType]
# Parse the current page html (`soup` is the BeautifulSoup alias and
# `html`/`browser` come from earlier cells).
items = soup(html, 'html.parser')

# In[22]:
# Walk the hemisphere product links, recording each title and sample URL.
hemisphere_image_urls = []
links = browser.find_by_css("a.product-item h3")

for idx in range(len(links)):
    entry = {}
    # Look the links up again every iteration — clicking navigates away
    # and would leave the previous element handles stale.
    browser.find_by_css("a.product-item h3")[idx].click()
    # The 'Sample' anchor holds the full-resolution image href.
    sample_anchor = browser.find_link_by_text('Sample').first
    entry['img_url'] = sample_anchor['href']
    entry['title'] = browser.find_by_css("h2.title").text
    hemisphere_image_urls.append(entry)
    browser.back()

# In[23]:
# Notebook echo of the collected records.
hemisphere_image_urls

# In[24]:
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images, returning everything in one flat dictionary.

    Returns:
        dict with keys ``mars_news_title``, ``mars_news_teaser``,
        ``mars_tweet``, ``mars_image``, ``mars_table`` and per-hemisphere
        ``hemi_image_title_N`` / ``hemi_image_url_N`` (N = 1..4).
    """
    browser = init_browser()

    # --- Mars news --------------------------------------------------------
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)  # let the JS-rendered list load
    soup = bs(browser.html, "html.parser")
    news_title = soup.find("div", class_="content_title").text
    news_text = soup.find("div", class_="article_teaser_body").text

    # --- JPL featured image ----------------------------------------------
    jpl_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_images_url)
    time.sleep(1)
    soup = bs(browser.html, "html.parser")
    # The carousel item's inline style holds url('<relative path>').
    img_source = soup.find(class_="carousel_item")['style']
    image_split = img_source.split("'")[1]
    # BUG FIX: the original concatenated the full search-page URL with the
    # relative path, producing a broken link; the base is the site root.
    featured_image_url = "https://www.jpl.nasa.gov" + image_split

    # --- Twitter weather scrape -------------------------------------------
    # BUG FIX: the original opened a second Browser without closing the
    # first one, leaking a webdriver process on every call.
    browser.quit()
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path)
    twit_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(twit_url)
    # Use find by css method with click to access tweet.
    # Resource used: https://www.seleniumeasy.com/selenium-tutorials/css-selectors-tutorial-for-selenium-with-examples
    browser.find_by_css('div[class="css-1dbjc4n r-1awozwy r-18u37iz r-1wtj0ep"]').first.click()
    # The tweet text ends up at index [6] of the matched spans.
    target_tweet = browser.find_by_css('span[class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"]')[6].text

    # --- Mars facts -------------------------------------------------------
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    mars_facts_df = pd.read_html(facts_url)[0]
    mars_facts_df.columns = ["Facts", "Values"]
    mars_facts_df.set_index("Facts", inplace=True)
    mars_facts_html = mars_facts_df.to_html()

    # --- Mars hemispheres -------------------------------------------------
    mars_hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemi_url)
    hemi_img_url = []
    links = browser.find_by_css("a.product-item h3")
    n = len(links)
    for row in range(n):
        hemi_dict = {}
        # Re-find each pass so clicking never leaves stale handles.
        browser.find_by_css("a.product-item h3")[row].click()
        sample_element = browser.find_link_by_text("Sample").first
        hemi_dict["img_url"] = sample_element["href"]
        hemi_dict["title"] = browser.find_by_css("h2.title").text
        hemi_img_url.append(hemi_dict)
        # Need to send browser back each time in order to click each product-item.
        browser.back()

    # --- Assemble results -------------------------------------------------
    mars_data = {
        "mars_news_title": news_title,
        "mars_news_teaser": news_text,
        "mars_tweet": target_tweet,
        "mars_image": featured_image_url,
        "mars_table": mars_facts_html,
        "hemi_image_title_1": hemi_img_url[0]["title"],
        "hemi_image_url_1": hemi_img_url[0]["img_url"],
        "hemi_image_title_2": hemi_img_url[1]["title"],
        "hemi_image_url_2": hemi_img_url[1]["img_url"],
        "hemi_image_title_3": hemi_img_url[2]["title"],
        "hemi_image_url_3": hemi_img_url[2]["img_url"],
        "hemi_image_title_4": hemi_img_url[3]["title"],
        "hemi_image_url_4": hemi_img_url[3]["img_url"]
    }
    browser.quit()
    return mars_data
def scrape():
    """Scrape NASA news, the JPL featured image, Mars facts and the
    hemisphere images.

    Returns:
        dict with keys ``Hemispheres`` (list of {title, img_url}),
        ``Articles`` ({news_title, news_p}), ``FeaturedImage`` and
        ``Facts`` (html table string).
    """
    ############ URLs needed ####---------------------------------------
    ##url for the Nasa news site
    nasa_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    #base url for jpl site and the "Mars" search page
    jpl_base_url = "https://www.jpl.nasa.gov"
    jpl_mars_url = jpl_base_url + "/spaceimages/?search=&category=Mars"
    ##url for space facts page
    mars_facts_url = "https://space-facts.com/mars/"
    #base url for hemisphere site and the search page listing each hemisphere
    hem_base_url = "https://astrogeology.usgs.gov"
    hem_url = hem_base_url + "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    #using splinter since requests is only returning partial results
    executable_path = {
        'executable_path': "C:/Program Files (x86)/Chrome Driver/chromedriver.exe"
    }
    # BUG FIX: the executable path dict must be unpacked as keyword
    # arguments; the original passed it positionally, which breaks the
    # splinter Browser constructor.
    browser = Browser('chrome', **executable_path, headless=False)

    ########### grab all page htmls up front so the browser session stays short
    browser.visit(nasa_news_url)
    time.sleep(2)
    nasa_news_html = browser.html
    browser.visit(jpl_mars_url)
    time.sleep(2)
    jpl_html = browser.html
    browser.visit(mars_facts_url)
    time.sleep(2)
    mars_facts_html = browser.html
    browser.visit(hem_url)
    time.sleep(2)
    hem_html = browser.html

    ####------------------------------------------------------------------
    ### hemispheres: visit each item page and record title + wide image url
    hem_soup = bs(hem_html, 'html.parser')
    hem_items = hem_soup.find_all(class_="item")
    hem_list = []
    for result in hem_items:
        print("--------")
        #open the browser on the current hemisphere page
        browser.visit(hem_base_url + result.a["href"])
        time.sleep(1)
        ##open the image to view it full size
        browser.find_link_by_text("Open").first.click()
        time.sleep(1)
        cur_soup = bs(browser.html, 'html.parser')
        # key is the page title, value is the full-size image link
        current_dict = {}
        current_dict["title"] = cur_soup.find('title').text
        current_dict["img_url"] = hem_base_url + cur_soup.find(
            'img', class_="wide-image")['src']
        hem_list.append(current_dict)
    browser.quit()

    #news: first item in the article list holds the latest title + teaser
    nasa_news_soup = bs(nasa_news_html, 'html.parser')
    first_art = nasa_news_soup.find(class_="item_list")
    art_dict = {
        "news_title": first_art.find(class_="content_title").text,
        "news_p": first_art.find(class_="article_teaser_body").text
    }

    #featured image: partial url lives in the slide's data attribute
    jpl_soup = bs(jpl_html, 'html.parser')
    partial_jpl_image_url = jpl_soup.find(
        class_="slide").a["data-fancybox-href"]
    featured_image_url = jpl_base_url + partial_jpl_image_url

    #facts: first table on the page, rendered back to html
    fact_df = pd.read_html(mars_facts_html)[0]
    fact_table = fact_df.to_html(header=False, index=False)

    ##final dictionary
    final_dict = {
        "Hemispheres": hem_list,
        "Articles": art_dict,
        "FeaturedImage": featured_image_url,
        "Facts": fact_table
    }
    print("scraping done")
    return final_dict
# bbh=buyh-1 # else: # bbm=buym-1 # bbh=buyh # # for test print("start.....") # t3 = datetime.strptime(buyd + ' ' + buyt, '%Y-%m-%d %H:%M:%S') #t3=datetime.strptime('2017-03-02 00:00:00','%Y-%m-%d %H:%M:%S') #b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://cart.jd.com/order/orderInfoMain.html') #b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/4325034.html') url = 'https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/' + item + '.html' # b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/4390094.html') b.visit(url) uu = b.find_link_by_text('账户登录') uu.click() b.fill('loginname', user) # b.fill('loginname','18371542519') b.fill('nloginpwd', ppp) b.find_by_id('loginsubmit').click() #b.visit('https://item.jd.com/3763103.html') tt = datetime.now() tttt = t3 - tt ttt = tttt.days * 24 * 60 * 60 + tttt.seconds while (ttt > -60 * 15): tt = datetime.now() tttt = t3 - tt ttt = tttt.days * 24 * 60 * 60 + tttt.seconds
# --- ClientNet UAT: bulk-delete every entry under a searched client -------
# Test-data name lists (only some are used in this chunk).
countryList = ['australia', 'chile']
countryTypeList = ['developed_', 'developing_']
typeaNameList = ['Existing_Scotia_Public', 'Existing_Scotia_Private', 'Non_Scotia_Public', 'Non_Scotia_Private']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION', 'BASCO DESEMBUAGE', 'A&T CORPORATION', 'asdf']
devgCompanyNameList = [countryList[1], countryList[1], countryList[1], 'asdf']

# open browser, navigate to the right page, configure,
from splinter import Browser
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

browser = Browser()

# Sign in to the UAT admin console.
# SECURITY NOTE(review): credentials are hard-coded; move to config/env.
browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCORP%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()

# Search for the target client record and open it.
browser.fill('clientSearchString', 'jason\'s client')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()

# Keep deleting while a 'Delete' link remains, accepting each confirm dialog.
while browser.find_link_by_text('Delete'):
    browser.find_link_by_text('Delete').first.click()
    browser.get_alert().accept()
def scrape_all():
    """Scrape Mars news, the JPL featured image, the facts table and the
    hemisphere images into a single ``data`` dictionary.

    Returns:
        dict with keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``facts`` and ``hemispheres``.
    """
    # BUG FIX: `data` was assigned into (data["news_title"] = ...) but
    # never initialized, raising NameError on the first use.
    data = {}

    # Initiate headless driver for deployment
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars News ---------------------------------------------------
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    soup = bs(browser.html, 'lxml')
    # Collect every linked article title...
    news_title = soup.find_all('div', class_='content_title')
    news_articles = []
    for news in news_title:
        if news.a and news.a.text:
            news_articles.append(news.a.text)
    # ...and every teaser paragraph.
    news_story = soup.find_all('div', class_='article_teaser_body')
    news_paragraph = []
    for paragraph in news_story:
        if paragraph.text:
            news_paragraph.append(paragraph.text)
    # The latest article is the first of each list.
    data["news_title"] = news_articles[0]
    data["news_paragraph"] = news_paragraph[0]

    # --- JPL Featured Space Image -----------------------------------------
    url_2 = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
    browser.visit(url_2)
    soup2 = bs(browser.html, 'lxml')
    # Relative links (href) for each image featured on the page.
    article_images = soup2.find_all('a', class_="group cursor-pointer block")
    image_links = []
    for image in article_images:
        image_links.append(image['href'])
    # Follow the first link and pull the full-size image URL.
    domain_url = 'https://' + browser.url.replace('http://', '').replace(
        'https://', '').split('/', 1)[0]
    browser.visit(domain_url + image_links[0])
    soup3 = bs(browser.html, 'lxml')
    img_url = soup3.find_all('div', class_="lg:w-auto w-full")
    img_href = []
    for i in img_url:
        if i.a and i.a['href']:
            img_href.append(i.a['href'])
    data["featured_image"] = img_href[0]

    # --- Mars Facts -------------------------------------------------------
    url_3 = 'https://space-facts.com/mars/'
    browser.visit(url_3)
    mars_facts = pd.read_html(browser.html)
    table_df = mars_facts[0]
    # Use Pandas to convert the data to a HTML table string.
    table_df.columns = ["description", "value"]
    data["facts"] = table_df.to_html(index=False)

    # --- USGS Astrogeology hemispheres ------------------------------------
    url_4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_4)
    hemisphere_image_urls = []
    # Click through each hemisphere link, collecting title + full-res URL.
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        # Re-find the links each pass; clicking navigates away.
        browser.find_by_css("a.product-item h3")[item].click()
        aref_list = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = aref_list["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()
    data["hemispheres"] = hemisphere_image_urls

    browser.quit()
    return data
# ClientNet UAT cleanup: delete every entry under the 'RF vs RFDA test
# cases' client in the CNETCOMM module.
# NOTE(review): `countryList` is defined elsewhere in this file.
countryTypeList = ['developed_', 'developing_']
typeaNameList = ['Existing_Scotia_Public', 'Existing_Scotia_Private', 'Non_Scotia_Public', 'Non_Scotia_Private']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION','BASCO DESEMBUAGE','A&T CORPORATION','asdf']
devgCompanyNameList = [countryList[1],countryList[1],countryList[1],'asdf']
# open browser, navigate to the right page, configure,
from splinter import Browser
browser = Browser()
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
# Sign in to the UAT admin console.
# SECURITY NOTE(review): credentials are hard-coded in source; they should
# come from configuration or the environment.
browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCOMM%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()
browser.find_link_by_text('International Banking').first.click()
# Open the test client record.
browser.fill('clientSearchString', 'RF vs RFDA test cases')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()
# Delete every entry, accepting the browser confirm dialog each time.
while len(browser.find_link_by_text('Delete'))>0:
    browser.find_link_by_text('Delete').first.click()
    browser.get_alert().accept()
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Drives a splinter Chrome browser through several Mars-related sites,
    upserts the combined result into the ``mars`` Mongo collection and
    returns it as a JSON response.

    Returns:
        flask.Response: JSON document containing every scraped field.
    """
    mars = mongo.db.mars

    # Set the executable path and initialize the chrome browser in splinter.
    # Raw string: the old literal ended in '\c', an invalid escape sequence
    # that only worked by accident.
    executable_path = {'executable_path': r'C:\Users\enere\Desktop\chromedriver'}
    browser = Browser('chrome', **executable_path)

    ##### MARS NEWS Scrape #####
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # Optional delay so the dynamically loaded article list is present.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    news_scraper = BeautifulSoup(browser.html, 'html.parser')
    # First article title and its teaser paragraph.
    title_element = news_scraper.find('div', {'class': 'content_title'})
    news_title = title_element.get_text()
    teaser_element = news_scraper.find('div', {'class': 'article_teaser_body'})
    teaser_text = teaser_element.get_text()

    ##### JPL Space Images Featured Image #####
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()
    # Find the "more info" button and click through to the detail page.
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    # Parse the resulting html and build an absolute image URL.
    img_scraper = BeautifulSoup(browser.html, 'html.parser')
    img_element = img_scraper.find('img', {'class': 'main_image'})
    img_src = img_element.get('src')
    img_url = f'https://www.jpl.nasa.gov{img_src}'

    ##### Mars Weather Scrape #####
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    weather_soup = BeautifulSoup(browser.html, 'html.parser')
    # Find a tweet with the data-name `Mars Weather`, then its text body.
    mars_weather_tweet = weather_soup.find('div', attrs={
        "class": "tweet",
        "data-name": "Mars Weather"
    })
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()

    ##### Mars Facts Scrape #####
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    # BUG FIX: the original called df.to_html() before df was assigned,
    # which raised NameError on every run.
    df = tables[0]
    df.columns = ['Mars - Earth Comparison', 'Mars', 'Earth']
    # Set the index to the Mars - Earth Comparison column.
    df.set_index('Mars - Earth Comparison', inplace=True)
    # Convert the DataFrame back to an HTML table string.
    html_table = df.to_html()

    ##### Mars Hemisphere Scrape #####
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_image_urls = []
    # First, get a list of all of the hemisphere links.
    links = browser.find_by_css("a.product-item h3")
    for i in range(len(links)):
        hemisphere = {}
        # Re-find the elements on each loop to avoid a stale element exception.
        browser.find_by_css("a.product-item h3")[i].click()
        # Find the Sample image anchor tag and extract the href.
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        hemisphere['title'] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        # Navigate backwards to the results list.
        browser.back()
    browser.quit()

    ##### Store and return the scraped data #####
    scraped_data = {
        'News Title': news_title,
        'Teaser Text': teaser_text,
        'Image URL': img_url,
        'Mars Weather': mars_weather,
        'Mars Hemisphere': hemisphere_image_urls,
        'Mars Facts': html_table
    }
    mars.update({}, scraped_data, upsert=True)
    return jsonify(scraped_data)
# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Retrieve the image url and title for each hemisphere.
links = browser.find_by_css('a.product-item h3')

# Loop through the links (re-queried each pass to avoid stale elements).
for i in range(len(links)):
    hemisphere = {}
    # find elements on each loop and click on link
    browser.find_by_css('a.product-item h3')[i].click()
    # find Sample image anchor tag and extract the href
    samp_element = browser.find_link_by_text('Sample').first
    img_url = samp_element['href']
    hemisphere["img_url"] = img_url
    # Get hemisphere title
    title = browser.find_by_css("h2.title").text
    hemisphere["title"] = title
    # Append hemisphere object to list.
    # (The original had a no-op bare expression `hemisphere_image_urls`
    # here — a notebook artifact — which has been removed.)
    hemisphere_image_urls.append(hemisphere)
    # Navigate back
    browser.back()

# In[138]:
def main(argv): email = None txtipt = None socks = None socksPort = None try: opts, args = getopt.getopt(argv, "hi:m:S:P:",["port=","socks=","input=","mail=","help"]) except: print "Use --help for help" sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): print 'Usage %s options \n' % (os.path.basename(__file__)) print ' -h, --help This help' print ' -m, --mail Your facebook login email' print ' -i, --input Your input file name' print ' -S, --socks Socks Proxy Address for Tor use' print ' -P, --port Port Socks for Tor use' sys.exit() elif opt in ("-i","--input"): txtipt = arg elif opt in ("-m","--mail"): email = arg elif opt in ("-S","--socks"): socks = arg elif opt in ("-P","--port"): socksPort = arg if not email or not txtipt: print 'Use --help for help' sys.exit() password = getpass.getpass() if socks and socksProt: proxy_settings = { 'network.proxy.type':1, 'network.proxy.socks': socks, 'network.proxy.socks_port': socksPort } browser = Browser('firefox',profile_preferences=proxy_settings) else: browser = Browser() # with Browser() as browser: browser.visit('https://m.facebook.com/') browser.fill("email",email); browser.fill("pass",password); browser.find_by_name("login").click() if browser.is_element_present_by_css('.login_error_box'): print 'The email and password didn\'t work.' sys.exit() try: fileipt = open(txtipt, 'r') except: sys.exit('Unable to open file %s' % txtipt) for line in fileipt: browser.visit(line) addButton = browser.find_link_by_text('Add Friend') if len(addButton) > 0: addButton[0].click()
import config
from splinter import Browser

# Log in to the Northeastern student-employment timesheet site using
# credentials held in the local config module.
browser = Browser()
browser.visit('https://studentemployment.neu.edu/tsx_studentjobs.aspx')
browser.fill('Skin$ctl08$LoginNameText', config.username)
browser.fill('Skin$ctl08$LoginPasswordText', config.password)
browser.find_by_name('Skin$ctl08$ctl14').click()
# Open the configured job and go to its time sheet.
browser.click_link_by_text(config.jobTitle)
browser.find_link_by_text('Go to time sheet').first.click()
# browser.find_link_by_text('Start time sheet').first.click()
# alert = browser.get_alert()
# alert.accept()


def addShift(shift):
    # Add one shift entry: choose the day from the drop-down by index,
    # select start/end times, then submit the form.
    # NOTE(review): shift is presumably a config-defined object with
    # .day/.start/.end attributes — confirm against config.shifts.
    browser.click_link_by_text('Add New Entry')
    browser.find_by_id('Skin_body_ctl01_WDL').find_by_css('option')[
        shift.day].click()
    browser.find_by_id('Skin_body_ctl01_StartDateTime1').select(shift.start)
    browser.find_by_id('Skin_body_ctl01_EndDateTime1').select(shift.end)
    browser.find_by_value('Add').first.click()


# NOTE(review): the loop body lies beyond this chunk — presumably addShift(shift).
for shift in config.shifts:
class DownPatent(object):
    """Download patent documents from a patent site, solving its image
    captcha with a neural-network recogniser (Python 2 code).

    db       -- a pymongo-style database handle (uses db.onlinecode).
    down_url -- URL of the patent-search page.
    """

    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        # PhantomJS headless browser; generous implicit wait for slow pages.
        self.browser = Browser("phantomjs", wait_time=10)
        #self.browser = Browser()

    # Download a patent document.
    def download(self, patentno):
        """Look up `patentno`, solve the captcha (up to 20 attempts) and
        return the download link, or "" if none was found.

        down_flag legend: 0 = not downloaded, 1 = patent does not exist,
        2 = download failed (3 appears to mean "link found" — not in the
        original legend; confirm).
        """
        # Visit the page; page loads may time out.
        download_link = ""
        down_flag = 0
        if True:
            print "打开网页"
            self.browser.visit(self.down_url)
            # If the search button is present, fill in the patent number
            # and submit the query.
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print "填写专利号"
            # Connection may time out / 404.
            if self.browser:
                print "打开验证码网页"
                # At most 20 captcha attempts per patent.
                code_handler = CodeHandler()
                # Captcha fill texts.
                list_fill_text = []
                # Captcha image paths.
                list_code_path = []
                # Captcha split flags (1 = split into 4 glyphs ok).
                list_split_flag = []
                # Captcha recognition flags (1 = accepted by the site).
                list_reg_flag = []
                for code_num in xrange(20):
                    print code_num
                    # Look for the captcha element on the page.
                    if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                        print "查找验证码"
                        # Screenshot the page, then crop out the 52x21 px
                        # captcha image at the located position.
                        #self.browser.driver.maximize_window()
                        self.browser.driver.save_screenshot("screenshot.png")
                        image = Image.open("screenshot.png")
                        image_location = self.find_location(image)
                        image_code = image.crop((image_location[0], image_location[1], image_location[0]+52, image_location[1]+21))
                        save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                        save_path_temp = "../%s" % save_path
                        image_code.save(save_path_temp)
                        list_code_path.append(save_path)
                        # Split the captcha image into individual glyphs.
                        list_split_image = self.deal_split(code_handler, image_code)
                        # If it split into exactly 4 glyphs, recognise them;
                        # otherwise reload and fetch a fresh captcha.
                        if len(list_split_image) == 4:
                            print "正确分割"
                            list_split_flag.append(1)
                            reg_plain_text = self.reg_code(list_split_image)
                            fill_text = "".join(reg_plain_text)
                            list_fill_text.append(fill_text)
                            # Fill in the recognised captcha text and submit.
                            #hand_fill_text = raw_input("Enter fill text:")
                            self.browser.fill("ValidCode", fill_text)
                            self.browser.find_by_value("确定").first.click()
                            print self.browser.html.encode("utf-8").find("验证码输入错误")
                            # Page does NOT say "captcha entered wrong":
                            if self.browser.html.encode("utf-8").find("验证码输入错误") == -1:
                                list_reg_flag.append(1)
                                # Page does NOT say "patent not found":
                                if self.browser.html.encode("utf-8").find("没有找到该专利") == -1:
                                    down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                    down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                    if down_link_one or down_link_two:
                                        print "查找说明书图形下载链接"
                                        list_reg_flag.append(1)
                                        # Prefer the standard edition; fall
                                        # back to the fast edition.
                                        if down_link_one:
                                            self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                        else:
                                            self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")
                                        print "查找下载链接"
                                        # Look for the final download link.
                                        download_a = self.browser.find_link_by_text("下载专利")
                                        if download_a:
                                            download_link = download_a["href"]
                                            # Download link found.
                                            down_flag = 3
                                            break
                                        else:
                                            print "下载失败"
                                            # Download failed.
                                            down_flag = 2
                                            break
                                    '''
                                    else:
                                        print "识别正确,未找到链接"
                                        list_reg_flag.append(0)
                                        self.browser.back()
                                        self.browser.reload()
                                    '''
                                else:
                                    print "不存在专利"
                                    # No such patent.
                                    down_flag = 1
                                    break
                            else:
                                print "识别错误,重新加载"
                                # Recognition rejected by the site: go back
                                # and reload for a new captcha.
                                list_reg_flag.append(0)
                                self.browser.back()
                                self.browser.reload()
                        else:
                            print "不能分割"
                            # Could not split the captcha; record the failure
                            # and reload for a fresh one.
                            list_fill_text.append("")
                            list_split_flag.append(0)
                            list_reg_flag.append(0)
                            self.browser.reload()
                # Store each attempt into the onlinecode collection: patent
                # number, captcha path, recognised text, flags and timestamp.
                for code_path, fill_text, split_flag, reg_flag in zip(list_code_path,list_fill_text, list_split_flag, list_reg_flag):
                    try:
                        self.db.onlinecode.insert({"indexflag": patentno, "codepath": code_path, "filltext": fill_text, \
                            "splitflag": split_flag, "regflag": reg_flag, "time": time.ctime()})
                    except:
                        pass
        return download_link

    # Split the captcha image into glyphs.
    def deal_split(self, code_handler, image):
        """Delegate captcha segmentation to the CodeHandler helper."""
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # Recognise the split glyphs.
    def reg_code(self, list_split_image):
        """Recognise each split glyph with the neural network.

        Pixels are binarised (ink = 1, background = 0) row by row; each
        network output is scaled by 100 and mapped into "0123456789abcdef".
        """
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in xrange(each_split_image.size[1]):
                for y in xrange(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
        for each in out:
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # Find the captcha image position inside the screenshot.
    def find_location(self, image):
        """Return [x, y] of the first non-black pixel, scanning columns
        left to right on the greyscale screenshot; [0, 0] if none found."""
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        # Sentinel: stays == image_width until a non-black pixel is seen.
        flag = image_width
        location = [0, 0]
        for y in xrange(image_width):
            for x in xrange(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break
        return location
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres,
    and render them into index.html.

    Fixes over the original: the notebook-only get_ipython() call (a
    NameError in a plain script) is removed, and a single splinter Chrome
    browser is created, reused for every page and closed at the end
    instead of instantiating three and leaking them all.
    """
    # https://splinter.readthedocs.io/en/latest/drivers/chrome.html
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA Mars News — a plain requests fetch is enough for this page.
    url = 'https://mars.nasa.gov/news/'
    html = requests.get(url)
    soup = bs(html.text, 'lxml')
    news_title = soup.find('div', class_="content_title").text.strip()
    news_p = soup.find('div', class_="rollover_description_inner").text.strip()

    # JPL Mars Space Images — the fancybox href lives in the page footer.
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    result = soup.find('div', class_="default floating_text_area ms-layer")
    featured_image = result.footer.a['data-fancybox-href']
    featured_image_url = f'http://www.jpl.nasa.gov{featured_image}'

    # Mars Weather — latest tweet text from the Mars weather account.
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    current_weather = soup.find(
        'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text

    # Mars Facts — write the profile table to table.html as a side effect.
    url = 'https://space-facts.com/mars/'
    table = pd.read_html(url)
    df = table[0]
    df.columns = ['Profile', 'Value']
    df.to_html('table.html', index=False, justify='center')

    # Mars Hemispheres — click each h3, grab the Sample link's href.
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_image_urls = []
    links = browser.find_by_css("h3")
    for i in range(len(links)):
        hemisphere = {}
        # Re-find on each pass to avoid stale element references.
        browser.find_by_css("h3")[i].click()
        sample = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample['href']
        hemisphere['title'] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()
    browser.quit()

    return render_template('index.html', news_p=news_p, news_title=news_title,
                           featured_image_url=featured_image_url,
                           current_weather=current_weather)
def scrape_info():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    image links, returning everything in one dict.

    Note: the hemisphere lists produced by the scraping loop are
    deliberately overwritten with hard-coded values captured while the
    site was still working (as in the original).

    Returns:
        dict: article title/excerpt, featured image URL, facts table HTML,
        weather tweet text, hemisphere titles and picture links.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA NEWS
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    browser.is_element_present_by_css('li.slide', wait_time=10)
    soup = BeautifulSoup(browser.html, 'html.parser')
    article = soup.find('div', class_="article_teaser_body").text
    title = soup.find(class_="bottom_gradient")
    title = title.find('h3').text

    # JPL MARS SPACE IMAGE — pull the fancybox href out of the raw footer markup.
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'lxml')
    string_footer = str(soup.find('footer'))
    string_footer = string_footer.split('data-fancybox-href="')[1].split(
        '" data-link')[0]
    featured_image_url = 'https://www.jpl.nasa.gov' + string_footer

    # MARS WEATHER TWITTER — a plain selenium driver for the JS-heavy page.
    url = 'https://twitter.com/MarsWxReport'
    driver = webdriver.Chrome()
    driver.implicitly_wait(5)  # seconds
    driver.get(url)
    element = driver.find_element_by_class_name("css-901oao")
    tweet = element.text
    # BUG FIX: the original leaked this second Chrome instance.
    driver.quit()

    # MARS FACTS
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url, index_col=0)
    mars_facts_df = tables[0]
    html_table = mars_facts_df.to_html()

    # MARS HEMISPHERES
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    hemispheres = soup.find_all('h3')
    hemi_list = []
    link_list = []
    for h in hemispheres:
        hemi_text = h.text.strip('Enhanced')
        hemi_list.append(hemi_text)
        # Click on the hemisphere link; best-effort, matching the original.
        try:
            browser.click_link_by_partial_text(hemi_text)
        except Exception:  # narrowed from a bare except
            print("Scraping Complete")
        # Find the full-resolution image link, then go back a page.
        link = browser.find_link_by_text('Original').first['href']
        link_list.append(link)
        browser.visit(url)
    # BUG FIX: the splinter browser was never closed.
    browser.quit()

    # Hard-coded values scraped earlier when the site was working.
    hemi_list = [
        'Cerberus Hemisphere ', 'Schiaparelli Hemisphere ',
        'Syrtis Major Hemisphere ', 'Valles Marineris Hemisphere '
    ]
    link_list = [
        'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'
    ]

    # WRITE ALL TO DICT
    mars_dict = {
        "article_title": title,
        "article_excerpt": article,
        "feature_image": featured_image_url,
        "fact_table": html_table,
        "mars_weather": tweet,
        "hemisphere_list": hemi_list,
        "hemisphere_pic": link_list
    }
    return (mars_dict)