def mars_Images():
    """Scrape JPL's "featured image" page and return {"featured_image_url": url}.

    Opens a visible Chrome browser via splinter, clicks through to the image
    detail page, and extracts the href of the full-resolution image.

    Fixes vs. original: removed the duplicated second-browser setup (the
    original created ``browser2`` and visited ``url2`` twice back to back),
    added the missing local ``time`` import, and quit the second browser so
    it is not leaked.
    """
    import time
    from splinter import Browser
    from bs4 import BeautifulSoup

    image_dict = {}

    # First pass: open the search page and find the detail-page link
    # (detail hrefs contain "PIA").
    browser = Browser('chrome', headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(3)  # let the page's JS render before clicking
    browser.click_link_by_id("full_image")
    elem = browser.find_link_by_partial_href("PIA")
    image_url = elem['href']
    browser.quit()

    # Second pass: open the detail page once and grab the image-file href.
    browser2 = Browser('chrome', headless=False)
    browser2.visit(image_url)
    elem = browser2.find_link_by_partial_href("/spaceimages/images")
    featured_image_url = elem['href']
    browser2.quit()

    image_dict["featured_image_url"] = featured_image_url
    return image_dict
def setHtml(roomDay, config):
    """Fill in and submit the room-booking form at the module-level ``htmlDir`` URL.

    ``roomDay`` supplies the start month and day; ``config`` supplies the
    room, description text, and start/end times.
    """
    browser = Browser(driver_name='chrome')
    browser.visit(htmlDir)
    # Populate each form control from the booking parameters.
    browser.select('selLocation', config['room'])
    browser.fill('textDescription', config['text'])
    browser.select('selStartMonth', roomDay.month)
    browser.select('selStartDay', roomDay.day)
    browser.select('selStartTime', config['startTime'])
    browser.select('selEndTime', config['endTime'])
    # Submit the completed form.
    browser.click_link_by_id('submit')
class StudentSplinterTestCase(ChannelsLiveServerTestCase):
    """Live-server test case that logs a student into a running workshop.

    Loads the workshop fixtures, opens a headless Chrome via splinter, and
    authenticates against workshop #1 in ``setUp``.
    """

    fixtures = ['workshops', "languages", "problems", "problem_tests"]

    def setUp(self):
        """Start workshop #1 and submit its join code through the auth form."""
        self.browser = Browser('chrome', headless=True)
        auth_url = f'{self.live_server_url}{reverse("workshop_auth")}'
        self.browser.visit(auth_url)
        # The workshop must be started before its code is accepted.
        self.workshop = Workshop.objects.get(pk=1)
        self.workshop.start()
        self.browser.fill('code', self.workshop.code)
        self.browser.click_link_by_id("submit-id-submit")

    def tearDown(self):
        """Close the browser opened in setUp."""
        self.browser.quit()
def ebay_kleinanzeigen(login_name, login_pw, title, pic_path, description, price, plz, street, company, phone):
    """Post a classified ad on ebay-kleinanzeigen.de end to end.

    Logs in, navigates the category tree to the parts ("teile") category,
    fills the ad form, uploads a picture through the native Windows
    "Öffnen" (Open) file dialog via pywinauto, fills the contact details,
    and submits. Windows-only due to the pywinauto dialog handling.
    """
    url = "https://www.ebay-kleinanzeigen.de/p-anzeige-aufgeben.html#?path=210/306/teile&isParent=false"
    browser = Browser('chrome')
    browser.driver.set_window_size(1200, 900)
    browser.visit(url)
    # Log in with the supplied credentials.
    browser.fill('loginMail', login_name)
    browser.fill('password', login_pw)
    browser.click_link_by_id("login-submit")
    # Walk the category tree: 210 -> 306 -> "teile", then confirm.
    browser.find_by_id("cat_210").click()
    browser.find_by_id("cat_306").click()
    browser.find_by_id("cat_teile").click()
    browser.find_by_css('.button').first.click()
    # Fill the ad form fields.
    browser.fill('title', title)
    browser.fill('description', description)
    browser.fill('priceAmount', price)
    browser.find_by_id("priceType2").click()
    # Scroll so the picture-upload control is in view before clicking it.
    browser.driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight/4);")
    browser.find_by_id('pictureupload-pickfiles').click()
    time.sleep(2)  # wait for the native file dialog to appear
    # Enumerate windows titled "Öffnen"; the process id is parsed from the
    # element description, but only the title-based connect below is used.
    apps = pywinauto.findwindows.find_elements(title_re='Öffnen')
    for app in apps:
        print(app)
        prozess = re.search('.+#([0-9]+)', str(app))
        prozess = int(prozess.group(1))
        print(prozess)
    app = pywinauto.Application().connect(title='Öffnen')
    # app = pywinauto.Application().connect(process=prozess)
    window = app.Dialog
    window.Wait('ready')
    # Type the picture path into the dialog's edit box and confirm.
    edit = window.Edit
    edit.ClickInput()
    edit.TypeKeys(pic_path)
    button = window.Button
    button.Click()
    time.sleep(10)  # allow the picture upload to finish
    # Contact / location details.
    browser.fill('zipCode', plz)
    browser.fill('streetName', street)
    browser.fill('contactName', company)
    browser.fill('phoneNumber', phone)
    # Scroll to the bottom so the submit button is clickable.
    browser.driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);")
    browser.find_by_id('pstad-submit').click()
    time.sleep(10)  # give the site time to process the submission
    browser.quit()
def get_url_code(auth_url, username, password, login='******'):
    """Drive an OAuth login flow and return the ``code`` query parameter.

    Visits ``auth_url``, follows the site's login link, authenticates
    through Facebook or Spotify depending on ``login``, then revisits
    ``auth_url`` and parses the authorization code out of the redirect URL.
    """
    browser = Browser(driver_name='chrome')
    browser.visit(auth_url)
    browser.click_link_by_partial_href("/en/login")
    if login == 'facebook':
        # Facebook path: follow the Facebook link and submit its form.
        browser.click_link_by_partial_href("https://www.facebook.com")
        browser.fill_form({'email': username, 'pass': password})
        browser.click_link_by_id('loginbutton')
    elif login == 'spotify':
        # Spotify path: fill credentials and press the "Log In" button.
        browser.fill_form({'username': username, 'password': password})
        browser.find_by_text('Log In')[0].click()
    # Revisit the auth URL; the redirect target now carries ?code=...
    browser.visit(auth_url)
    redirected = browser.url
    auth_code = redirected.split("?code=")[1].split('&')[0]
    browser.quit()
    return auth_code
def scrape():  # set up Browser
    """Scrape several Mars sites (NASA news, JPL featured image, weather
    tweet, facts table, USGS hemisphere links) and return one facts dict."""
    executable_path = {'executable_path': "chromedriver"}
    browser = Browser('chrome', **executable_path, headless=False)
    # Get Nasa News
    nasa_news = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_news)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find_all('li', class_="slide")
    # NOTE(review): iterating results[0] walks the *children* of the first
    # <li>, not the list of slides — confirm this matches the page markup.
    for result in results[0]:
        news_title = result.find('div',class_="content_title").text
        news_description = result.find('div',class_="article_teaser_body").text
        news_url = nasa_news + result.a['href']
    time.sleep(1)
    # Collect JPL Image
    jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_id('full_image')
    time.sleep(2)  # wait for the lightbox before following the details link
    browser.click_link_by_partial_href('/spaceimages/details')
    soup = bs(browser.html, 'html.parser')
    results = soup.find('figure', class_ = 'lede')
    base_url = browser.url[:24]  # scheme+host prefix of the current URL
    img = results.a.img['src']
    featured_img_url = base_url + img
    time.sleep(1)
    # Mars Weather
    weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find('div', class_="js-tweet-text-container")
    # Strip any embedded link from the tweet before reading its text;
    # best-effort: a missing <a> is silently ignored.
    try:
        results.a.decompose()
    except:
        pass
    mars_weather = results.find('p').text
    time.sleep(1)
    # Mars Facts: second table on the page, rendered to a one-line HTML string.
    space_facts = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(space_facts)[1].rename(columns = {0:'Fact',1:'Data'}).to_html(index=False).replace('\n','')
    time.sleep(1)
    # Mars Hemispheres
    hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Find list of image tags
    base_url = browser.url[:29]
    results = soup.find_all('div',attrs={'class':'collapsible results'})[0]
    images = results.find_all('div')[:]
    # Iterate through the tags, collect hrefs, navigate to each page and
    # collect the full image link.
    hemisphere_image_urls = []
    for image in range(0,len(images)):
        # NOTE(review): the even-index filter presumably skips interleaved
        # divs (`image == 0` is already covered by `image % 2 == 0`) —
        # confirm against the page markup.
        if image == 0 or image % 2 == 0:
            url = base_url+images[image].a['href']
            title = (images[image].h3.text)
            browser.visit(url)
            time.sleep(1)
            soup = bs(browser.html,'html.parser')
            results = soup.find_all('ul')[0]
            result = results.find_all('li')[0]
            hemi_url = (result.a['href'])
            hemisphere_image_urls.append({'title':title, 'img_url':hemi_url})
    facts = {'news_title':news_title,
             'news_description':news_description,
             'news_url':news_url,
             'featured_img_url':featured_img_url,
             'mars_weather':mars_weather,
             'mars_facts':mars_facts,
             'hemi_img_url':hemisphere_image_urls
             }
    # NOTE(review): purpose of this final image visit is unclear; the
    # browser is also never quit.
    browser.visit('https://i.pinimg.com/originals/49/78/3e/49783e18b9ac11c560362029ba1f3328.jpg')
    return facts
def scrape():
    """Scrape NASA news, the JPL featured image, a Mars weather tweet, the
    space-facts table, and USGS hemisphere images; return them as one dict."""
    # --- Latest NASA news (plain requests, no browser needed) ---
    url = 'https://mars.nasa.gov/news/'
    response = req.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find("div", class_="content_title").text
    description = soup.find("div", class_="rollover_description_inner").text
    # --- JPL featured image (needs a real browser to click through) ---
    browser = Browser('chrome', headless=False)
    img_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(img_url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    browser.click_link_by_id('full_image')
    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    soup2 = BeautifulSoup(image_html, 'html.parser')
    main_img_url = soup2.find('img', class_='main_image')
    split_img_url = main_img_url.get('src')  # site-relative path
    featured_image_url = "https://www.jpl.nasa.gov" + split_img_url
    # --- Latest Mars weather tweet ---
    mars_twitter = "https://twitter.com/marswxreport?lang=en"
    browser.visit(mars_twitter)
    html = browser.html
    twitter_soup = BeautifulSoup(html, 'html.parser')
    mars_tweet = twitter_soup.find('div', class_="js-tweet-text-container")
    mars_weather = mars_tweet.find('p', 'tweet-text').get_text()
    # --- Mars facts table via pandas ---
    facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(facts_url)
    mars_df = tables[0]
    mars_df.columns = ['Mars Facts', 'Mars Data']
    mars_df.set_index('Mars Facts', inplace=True)
    html_table = mars_df.to_html()
    mars_df.to_html('table.html')  # side effect: writes table.html to CWD
    # --- USGS hemisphere images (requests only; follow each item link to
    # its download page for the full-resolution URL) ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = BeautifulSoup(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="itemLink product-item")
    # Collect {"Title", "Image_Url"} dicts, one per hemisphere.
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        img_title = hemi_img.find('h3').text
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = BeautifulSoup(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })
    # NOTE(review): the browser opened above is never quit.
    mars_data = {
        "News_Title": title,
        "Paragraph_Text": description,
        "Most_Recent_Mars_Image": featured_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }
    return mars_data
def scrape():
    """Scrape NASA Mars news, the JPL featured image, the space-facts table,
    and USGS hemisphere images; return everything in one dict.

    Fixes vs. original:
    - ``NASA_Mars_News_soup`` was referenced before ever being created
      (guaranteed NameError); the soup is now built from the visited page.
    - ``Mars_Facts_df.to_html('Mars_Facts_table.html')`` returns ``None``
      when a path is passed, so the returned ``mars_facts_html`` was always
      ``None``; the HTML string and the file write are now separate calls.
    """
    # Initialize Splinter for Windows Users
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --------NASA Mars News ---------
    NASA_News_url = 'https://mars.nasa.gov/news/'
    browser.visit(NASA_News_url)
    # Parse the visited page (missing in the original, which read
    # NASA_Mars_News_soup without defining it).
    NASA_Mars_News_soup = bs(browser.html, 'html.parser')
    # This was noted as not consistently returning the right element;
    # depends on live page markup.
    news_title = NASA_Mars_News_soup.find('div', class_='content_title').text
    news_p = NASA_Mars_News_soup.find('div', class_='article_teaser_body').text

    # ---------JPL Mars Space Images - Featured Image----------
    JPL_Mars_Images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(JPL_Mars_Images_url)
    time.sleep(5)  # page renders via JS; delays make the clicks reliable
    browser.click_link_by_id('full_image')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    # Parse the detail page and build the absolute image URL.
    JPL_Mars_Images_soup = bs(browser.html, 'html.parser')
    figure = JPL_Mars_Images_soup.find('figure')
    relative_mars_image_url = figure.find('a')['href']
    mars_image_url = f'https://www.jpl.nasa.gov{relative_mars_image_url}'

    # -------------Mars Facts ---------------------
    # First table on the page; index on the description column.
    Mars_Facts_df = pd.read_html('https://space-facts.com/mars/')[0]
    Mars_Facts_df.columns = ['description', 'value']
    Mars_Facts_df.set_index('description', inplace=True)
    Mars_Facts = Mars_Facts_df.to_html()
    Mars_Facts_df.to_html('Mars_Facts_table.html')  # side effect: file in CWD

    # --------------- Mars Hemispheres ------------------------
    Mars_Hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(Mars_Hemisphere_url)
    Mars_Hemisphere_soup = bs(browser.html, 'html.parser')
    hemisphere_items = Mars_Hemisphere_soup.find_all('div', class_='item')
    hemisphere_image_urls = []
    hemispheres_main_url = 'https://astrogeology.usgs.gov'
    # Visit each hemisphere's detail page and collect its full-size image.
    for i in hemisphere_items:
        hemisphere_title = i.find('h3').text
        partial_img_url = i.find('a', class_='itemLink product-item')['href']
        browser.visit(hemispheres_main_url + partial_img_url)
        partial_img_soup = bs(browser.html, 'html.parser')
        complete_img_url = hemispheres_main_url + partial_img_soup.find(
            'img', class_='wide-image')['src']
        hemisphere_image_urls.append({
            "title": hemisphere_title,
            "img_url": complete_img_url
        })

    mars_information = {
        'news_title': news_title,
        'news_paragraph': news_p,
        'mars_image_url': mars_image_url,
        'mars_facts_html': Mars_Facts,
        'mars_hemispheres': hemisphere_image_urls
    }
    return mars_information
def scrape():
    """Scrape Mars news, the JPL featured image, a facts table, and the four
    USGS hemisphere "Sample" image links; return everything in one dict.

    Fix vs. original: the returned "Hemisphere_urls" list contained
    ``hemis_url`` (the search-results page URL) instead of the scraped
    ``cerberus_hemis_url``, which was computed and printed but never used.
    """
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # --- Latest NASA news headline and teaser ---
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    # Wait (up to 10s) for the news list to render before parsing.
    if browser.is_element_present_by_tag('li', wait_time=10):
        soup = BeautifulSoup(browser.html, 'html.parser')
    soup = BeautifulSoup(browser.html, 'html.parser')
    section = soup.find('li', class_="slide")
    news_title = section.find('div', class_="content_title").text
    news_head = section.find('div', class_='article_teaser_body').text

    # --- JPL featured image ---
    images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(images_url)
    if browser.is_element_present_by_id('full_image', wait_time=10):
        soup = BeautifulSoup(browser.html, 'html.parser')
    browser.click_link_by_id("full_image")
    time.sleep(2)  # allow the lightbox to open before the next click
    browser.click_link_by_partial_text("more info")
    soup = BeautifulSoup(browser.html, 'html.parser')
    image_tags = soup.select_one('figure.lede a img').get("src")
    featured_image_url = "https://www.jpl.nasa.gov" + image_tags

    # --- Mars facts (second table on the page) ---
    facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(facts_url)
    df = tables[1]
    html_table = df.to_html()
    df.to_html('table.html')  # side effect: writes table.html to CWD

    # --- Hemisphere "Sample" image links ---
    hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemis_url)
    browser.click_link_by_partial_text("Cerberus")
    soup2 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title = soup2.select_one("div.content h2").text
    cerberus_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title}: {cerberus_hemis_url}")
    # NOTE(review): the following clicks assume the other hemisphere links
    # remain reachable from the current page without navigating back —
    # confirm against the live site.
    browser.click_link_by_partial_text("Schiaparelli")
    soup3 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_2 = soup3.select_one("div.content h2").text
    schiaparelli_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_2}: {schiaparelli_hemis_url}")
    browser.click_link_by_partial_text("Syrtis")
    soup4 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_3 = soup4.select_one("div.content h2").text
    syrtis_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_3}: {syrtis_hemis_url}")
    browser.click_link_by_partial_text("Valles")
    soup5 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_4 = soup5.select_one("div.content h2").text
    marineris_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_4}: {marineris_hemis_url}")

    return_dict = {
        "news_title": news_title,
        "news_head": news_head,
        "featured_img": featured_image_url,
        "table": html_table,
        "Hemisphere_urls": [
            cerberus_hemis_url,  # fixed: was hemis_url (the search page)
            schiaparelli_hemis_url,
            syrtis_hemis_url,
            marineris_hemis_url
        ]
    }
    browser.quit()
    return return_dict
def mars_scrape():
    """Scrape Mars news, the JPL featured image, the facts table, and the
    four USGS hemisphere sample links; return them in one dict."""
    mars = {}
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    # --- Latest news title and teaser paragraph ---
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    sleep(1)  # give the JS-rendered page a moment to load
    ourwebpage = browser.html
    soup = bs(ourwebpage, 'html.parser')
    x = soup.body.find_all(class_="content_title")
    # Skip the first match (x[1:]) and keep only the first remaining title.
    alltitle = []
    for i in x[1:]:
        alltitle.append(i.find('a').text.strip())
    alltitle = alltitle[0]
    mars['title'] = alltitle
    paragraph = soup.body.find_all(class_="article_teaser_body")
    news_p = []
    for i in paragraph:
        news_p.append(i.text)
    news_p = news_p[0]  # keep only the newest teaser
    mars['news_paragraph'] = news_p
    # --- JPL featured image: click through to the "more info" page ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    sleep(1)
    browser.click_link_by_id('full_image')
    z = browser.find_link_by_partial_text("more info")
    z.click()
    sleep(1)
    imgwebpage = browser.html
    soup2 = bs(imgwebpage, 'html.parser')
    image_path = soup2.find(class_="main_image")['src']  # site-relative
    image_full_path = "https://www.jpl.nasa.gov" + image_path
    mars["feature_img"] = image_full_path
    # --- Facts table (first table on the page, as a DataFrame) ---
    mars_table = pd.read_html("https://space-facts.com/mars/")[0]
    mars_table.rename(columns={0: "Category", 1: "Value"}, inplace=True)
    mars["mars_table"] = mars_table
    # --- Hemisphere sample links: click each item, grab "Sample", go back ---
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    sleep(1)
    image = []
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        sleep(1)
        html_image = browser.html
        soupitem = bs(html_image, 'html.parser')
        zz = soupitem.find('a', text="Sample")
        image.append(zz['href'])
        browser.back()
    # NOTE(review): titles are hard-coded by position; this assumes the
    # result page always lists the hemispheres in this exact order — verify.
    hemisphere_image_urls = [
        { "title": "Valles Marineris Hemisphere", "img_url": image[0] },
        { "title": "Cerberus Hemisphere", "img_url": image[1] },
        { "title": "Schiaparelli Hemisphere", "img_url": image[2] },
        { "title": "Syrtis Major Hemisphere", "img_url": image[3] },
    ]
    mars["mars_image"] = hemisphere_image_urls
    return mars
browser = Browser('chrome', **executable_path) ##chrome #browser = Browser(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 11_1 like Mac OS X) AppleWebKit/604.2.8 (KHTML, like Gecko) Version/11.0 Mobile/15B57 Safari/604.1", **executable_path) ## iPhone #browser = Browser('firefox', **executable_path) ##firefox #Test Case 1 print("=========================================================") print("Running Test Case 1: Upload photo for Classification") browser.visit('http://54.191.193.7:5000/') print("Visiting browser...") time.sleep(2) element = browser.driver.find_element_by_id("imageFile") pathToImage = os.path.abspath("static/testing/Capture5.JPG") element.send_keys(pathToImage) print("Image chosen...") time.sleep(2) browser.click_link_by_id('submit') print("Image submitted for classification...") time.sleep(2) assert browser.is_text_present('Image Uploaded') == True print("=========================================================") #Test Case 2 print("Running Test Case 2: Upload nothing and try to submit for classification") browser.visit('http://54.191.193.7:5000/') print("Visiting browser...") time.sleep(2) browser.click_link_by_id('submit') print("Submit button pressed...") time.sleep(2) assert browser.is_text_present('Please choose an image!') == True print("=========================================================")
def scrape():
    """Scrape Mars news, the JPL featured image, a weather tweet, the facts
    table, and the four hemisphere sample images; return one dict.

    Exported from a notebook — the ``# In[..]:`` markers are the original
    cell boundaries.
    """
    import pandas as pd
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import requests
    import re
    import nbconvert
    import time
    # In[76]:
    # News listing page, scraped with plain requests.
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    # In[77]:
    # Store the news title.
    news_title = soup.find('div',class_='content_title').text
    # In[78]:
    # NOTE(review): the article URL is hard-coded to one specific story, so
    # news_p will not track the latest headline — verify this is intended.
    p_url = "https://mars.nasa.gov/news/8719/nasa-invites-public-to-share-excitement-of-mars-2020-perseverance-rover-launch/"
    response = requests.get(p_url)
    soup = bs(response.text,'lxml')
    # In[79]:
    # Store the first body paragraph (third <p> on the page) in news_p.
    results = soup.find_all('p')
    paragraphs = []
    for result in results:
        paragraphs.append(result)
    news_p = paragraphs[2].text
    # In[80]:
    # Activate splinter (Windows chromedriver path).
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    # In[35]:
    splinter_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(splinter_url)
    # In[36]:
    browser.click_link_by_id('full_image')
    # In[37]:
    # Navigate to "more info" to find the full size image.
    browser.click_link_by_partial_text('more info')
    # In[38]:
    featured_image_url = browser.find_by_css('.main_image')[0]['src']
    browser.quit()
    # In[60]:
    # Fresh browser for the Twitter scrape.
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    twitter_url = 'https://twitter.com/MarsWxReport'
    browser.visit(twitter_url)
    time.sleep(5)  # let the JS-heavy page render
    # In[61]:
    html = browser.html
    soup = bs(html,'html.parser')
    # In[63]:
    # NOTE(review): the tweet text is picked by a hard-coded index [27] into
    # spans with an auto-generated CSS class — extremely brittle; verify.
    mars_weather = soup.find_all('span',class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")[27].text
    browser.quit()
    # In[1]:
    # Pandas scrape of the Mars facts table.
    url = 'https://space-facts.com/mars/'
    # In[7]:
    tables = pd.read_html(url)
    tables[0]
    # In[ ]:
    mars_facts = tables[0].to_html()
    # In[65]:
    # Fresh browser for the hemisphere image pages.
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_images_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_images_url)
    # In[67]:
    # For each hemisphere: click its heading, grab the "Sample" link and the
    # title, then go back to the listing.
    browser.find_by_css('h3')[0].click()
    cerberus_image = browser.find_by_text('Sample')['href']
    cerberus_title = browser.find_by_css('.title').text
    browser.back()
    # In[68]:
    browser.find_by_css('h3')[1].click()
    schiaparelli_image = browser.find_by_text('Sample')['href']
    schiaparelli_title = browser.find_by_css('.title').text
    browser.back()
    # In[69]:
    browser.find_by_css('h3')[2].click()
    syrtis_major_image = browser.find_by_text('Sample')['href']
    syrtis_major_title = browser.find_by_css('.title').text
    browser.back()
    # In[70]:
    browser.find_by_css('h3')[3].click()
    valles_marineris_image = browser.find_by_text('Sample')['href']
    valles_marineris_title = browser.find_by_css('.title').text
    browser.quit()
    # In[71]:
    # Collect the four hemisphere entries.
    hemisphere_images = [
        {"title": cerberus_title, "img_url": cerberus_image},
        {"title": schiaparelli_title, "img_url": schiaparelli_image},
        {"title": syrtis_major_title, "img_url": syrtis_major_image},
        {"title": valles_marineris_title, "img_url": valles_marineris_image}
    ]
    mars_dict = {'mars_news':news_title,'news_summary':news_p,'featured_mars_image':featured_image_url,'mars_weather':mars_weather,'mars_facts':mars_facts,'mars_hemispheres':hemisphere_images}
    return mars_dict
def scrape():
    """Scrape Mars news, the JPL featured image, a weather tweet, hemisphere
    images, and the facts table; return one dict.

    Exported from a notebook — the ``# In[..]:`` markers are the original
    cell boundaries; each site gets its own browser session.
    """
    # Visit Nasa URL through splinter and parse HTML with beautiful soup
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[3]:
    # Collect the latest news title and paragraph text.
    news = soup.find("div", class_="list_text")
    mars_title = news.find("div", class_="content_title").get_text()
    mars_p = news.find("div", class_="article_teaser_body").get_text()
    print(mars_title)
    print(mars_p)
    # In[4]:
    browser.quit()
    # ### JPL Mars Space Images - Featured Image
    # In[5]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[6]:
    # Navigate the site to display the full size .jpg image.
    browser.click_link_by_id('full_image')
    # In[7]:
    # NOTE(review): the trailing space in 'more info ' is matched against the
    # live page's link text — do not "fix" it without checking the site.
    button = 'more info '
    time.sleep(3)
    browser.find_by_text(button).click()
    # In[8]:
    # Retrieve the URL string for the full-size image.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    url = soup.find("img", class_="main_image")['src']
    featured_url = 'https://www.jpl.nasa.gov' + url
    featured_url
    # In[9]:
    browser.quit()
    # ### Mars Weather
    # In[10]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[11]:
    # Scrape the latest Mars weather tweet from the page.
    mars_weather = soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_weather
    # In[12]:
    # Replace newlines with commas for a single-line weather string.
    mars_weather = ",".join(mars_weather.split("\n"))
    mars_weather
    # In[13]:
    browser.quit()
    # ### Mars Hemispheres
    # In[14]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[15]:
    items = soup.find_all('div', class_='description')
    # Collect {"title", "img_url"} dicts, one per hemisphere.
    hemisphere_image_urls = []
    hemispheres_main_url = 'https://astrogeology.usgs.gov'
    for i in items:
        title = i.find('h3').text
        # Link that leads to the full-image website.
        partial_img_url = i.find('a', class_='itemLink product-item')['href']
        browser.visit(hemispheres_main_url + partial_img_url)
        partial_img_html = browser.html
        soup = BeautifulSoup(partial_img_html, 'html.parser')
        # Full image source url.
        img_url = hemispheres_main_url + soup.find('img', class_='wide-image')['src']
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    hemisphere_image_urls
    # In[16]:
    browser.quit()
    # ### Mars Facts
    # In[24]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    # In[25]:
    # Use pandas read_html to scrape the tabular data; keep only the first
    # value column and flatten the HTML to one line.
    mars_factsDf = pd.read_html(url)
    mars_fact = mars_factsDf[0]
    mars_fact.columns = ['Description', 'Value', 'Value2']
    mars_fact.set_index('Description', inplace=True)
    mars_fact = mars_fact.iloc[:, 0:1]
    mars_facts = mars_fact.to_html()
    mars_facts = mars_facts.replace('\n', '')
    mars_fact.to_html('table1.html')  # side effect: writes table1.html to CWD
    mars_fact
    # In[20]:
    browser.quit()
    # In[27]:
    # Assemble everything scraped above into a single dict.
    mars_data = {
        "news_title": mars_title,
        "news_p": mars_p,
        "featured_url": featured_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    return mars_data
def scrape_info():
    """Scrape Mars data (news, JPL featured image, weather tweet, facts
    table, hemisphere images) and return it as one dictionary.

    Returns:
        dict: keys news_title0..2, description0..2, JPL_link,
        weather_tweet, facts_table, title1..4, img_url1..4.

    Side effects: opens a visible Chrome window (splinter) and prints
    progress messages.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars News: latest three titles and teaser paragraphs ---
    source = requests.get('https://mars.nasa.gov/news/').text
    soup = bs(source, 'html.parser')
    article = soup.find_all('div', class_='content_title')
    news_title0 = article[0].a.text
    news_title1 = article[1].a.text
    news_title2 = article[2].a.text
    description = soup.find_all('div', class_="rollover_description_inner")
    news_p0 = description[0].text
    news_p1 = description[1].text
    news_p2 = description[2].text

    # --- JPL featured space image ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    try:
        browser.click_link_by_id('full_image')
    except Exception:  # narrowed from bare except
        browser.click_link_by_partial_text('FULL IMAGE')
    else:
        print("Scraping Full Image Complete")

    # BUG FIX: url2/f1/f2 were assigned only inside the try/except bodies,
    # so a failure before the assignment raised NameError later.
    url2 = None
    f1 = None
    f2 = None
    check = 0
    try:
        links_found = browser.find_link_by_partial_href('spaceimages/details')
        url2 = links_found[0]["href"]
        browser.click_link_by_partial_text('more info')
        links_found2 = browser.find_link_by_partial_href(
            'spaceimages/images/largesize')
        f1 = links_found2[0]["href"]
        check = 1
    except Exception:
        # Fallback: navigate straight to the detail page, but only if we
        # actually captured its URL before the failure.
        if url2 is not None:
            browser.visit(url2)
            links_found3 = browser.find_link_by_partial_href(
                'spaceimages/images/largesize')
            f2 = links_found3[0]["href"]
    else:
        print("Scraping More Info Complete")
    featured_image_url = f1 if check == 1 else f2

    # --- Mars weather: first tweet text on the report account page ---
    source3 = requests.get('https://twitter.com/marswxreport?lang=en').text
    soup = bs(source3, 'html.parser')
    tweets = soup.find_all(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    mars_weather = tweets[0].text

    # --- Mars facts table rendered as single-line HTML ---
    facts = pd.read_html("https://space-facts.com/mars/")
    mars_facts_df = facts[1]
    mars_facts_df.columns = ['Description', 'Value']
    mars_facts_df.set_index("Description", inplace=True)
    mars_facts_table = mars_facts_df.to_html()
    mars_facts_table = mars_facts_table.replace('\n', '')

    # --- Mars hemispheres: title from <h3>, link from "Original" anchor ---
    hemi = ("https://astrogeology.usgs.gov/search/results"
            "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(hemi)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemi_class = soup.find_all('h3')
    cerberus_title = hemi_class[0].text
    schiaparelli_title = hemi_class[1].text
    syrtis_title = hemi_class[2].text
    valles_title = hemi_class[3].text

    # Get Cerberus information.
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    link1 = browser.find_link_by_partial_text('Original')
    cerberus_link = link1[0]["href"] + "/full.jpg"
    browser.back()
    # Get Schiaparelli information.
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    link2 = browser.find_link_by_partial_text('Original')
    schiaparelli_link = link2[0]["href"] + "/full.jpg"
    browser.back()
    # Get Syrtis Major information.
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    link3 = browser.find_link_by_partial_text('Original')
    syrtis_link = link3[0]["href"] + "/full.jpg"
    browser.back()
    # Get Valles Marineris information.
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    link4 = browser.find_link_by_partial_text('Original')
    valles_link = link4[0]["href"] + "/full.jpg"
    browser.back()

    marsdata = {
        "news_title0": news_title0, "description0": news_p0,
        "news_title1": news_title1, "description1": news_p1,
        "news_title2": news_title2, "description2": news_p2,
        "JPL_link": featured_image_url,
        "weather_tweet": mars_weather,
        "facts_table": mars_facts_table,
        "title1": cerberus_title, "img_url1": cerberus_link,
        "title2": schiaparelli_title, "img_url2": schiaparelli_link,
        "title3": syrtis_title, "img_url3": syrtis_link,
        "title4": valles_title, "img_url4": valles_link
    }
    # Close the browser after scraping.
    browser.quit()
    return marsdata
class tickets(object):
    """Automated 12306.cn train-ticket grabber driven by splinter.

    Configure the class attributes below, then call start(). login()
    blocks until the captcha has been solved manually in the browser.
    """
    # Account user name and password.
    username = None
    passwd = None
    # Departure and destination stations (12306 cookie-encoded values).
    starts = None
    ends = None
    # Travel date, format 2019-02-20.
    dtime = None
    # Train number.
    order = None
    # Passenger name as it appears in the 12306 passenger list.
    passenger = None
    # Seat type: index into __seatTypeList (1 = hard seat, 3 = hard sleeper).
    seatType = None
    # Whether this is a student ticket.
    isStudent = False
    # Seconds to wait between availability queries.
    refresh_period = 5
    # Seat-type element-id prefixes on the query page; indices 1/3 map to
    # hard seat ("YZ_") / hard sleeper ("YW_").
    __seatTypeList=[None,"YZ_",None,"YW_"]
    """网址"""
    # 12306 URLs: ticket query page, login page, account home, order page.
    ticket_url = "https://kyfw.12306.cn/otn/leftTicket/init"
    login_url = "https://kyfw.12306.cn/otn/resources/login.html"
    initmy_url = "https://kyfw.12306.cn/otn/view/index.html"
    buy_url="https://kyfw.12306.cn/otn/confirmPassenger/initDc"

    def __init__(self):
        # Browser backend used by splinter.
        self.driver_name='firefox'

    def login(self):
        """Fill in the login form, then poll until the captcha has been
        solved manually and 12306 redirects to the account home page."""
        self.driver.visit(self.login_url)
        sleep(1)
        # Switch to the account/password tab and fill in the credentials.
        self.driver.find_by_text("账号登录")[0].click()
        self.driver.find_by_id("J-userName")[0].fill(self.username)
        self.driver.find_by_id("J-password")[0].fill(self.passwd)
        # "Waiting for captcha, enter it yourself..."
        print("等待验证码,自行输入...")
        while True:
            # Poll once per second until the post-login home page loads.
            if self.driver.url != self.initmy_url:
                sleep(1)
            else:
                break

    def start(self):
        """Open the browser, log in, poll the query page until the
        configured train has a seat, then pre-fill the order form."""
        # Abort early if any required parameter is missing.
        if not self.username or not self.passwd or not self.starts or not self.ends \
            or not self.dtime or not self.order or not self.passenger or not self.seatType:
            # "Please initialize the parameters username, passwd, ..."
            print("请初始化参数username,passwd,starts,ends,dtime,order,passenger,seatType...")
            return
        # Open the browser.
        self.driver=Browser(driver_name=self.driver_name)
        # Log in (waits for manual captcha entry).
        self.login()
        # Begin ticket grabbing on the query page.
        self.driver.visit(self.ticket_url)
        print("购票页面开始...")  # "ticket page started..."
        # Pre-set the query by writing 12306's saved-search cookies
        # instead of filling in the form.
        self.driver.cookies.add({"_jc_save_fromStation": self.starts})
        self.driver.cookies.add({"_jc_save_toStation": self.ends})
        self.driver.cookies.add({"_jc_save_fromDate": self.dtime})
        # Reload so the cookie-driven query takes effect.
        self.driver.reload()
        # Keep querying until a successful booking click navigates away.
        while self.driver.url==self.ticket_url:
            # Click the query button each cycle.
            try:
                sleep(self.refresh_period)
                if self.driver.is_element_present_by_id("query_ticket",5):
                    self.driver.click_link_by_id("query_ticket")
                if self.driver.is_element_present_by_id("ticket_"+self.order,3):
                    train_info=self.driver.find_by_id("ticket_"+self.order)
                    # Availability cell for the requested seat type.
                    train_seat=train_info.find_by_id(self.__seatTypeList[self.seatType]+self.order)[0]
                    # "无" / "--" mean sold out / not offered on this train.
                    if train_seat.text!="无" and train_seat.text!="--":
                        print("有票,准备预订...")  # "tickets available, booking..."
                        train_info.find_by_text("预订")[0].click()
                    else:
                        self.driver.reload()
            # Any scraping hiccup: reload and retry on the next cycle.
            except:
                self.driver.reload()
        print('开始选择用户...')  # "selecting passenger..."
        # Wait for the passenger list to load.
        if self.driver.is_element_present_by_id("normal_passenger_id",5):
            if self.driver.is_element_present_by_text(self.passenger,5):
                psg_list=self.driver.find_by_id("normal_passenger_id")
                psg_list.find_by_text(self.passenger)[0].click()
                # Student tickets pop up an extra confirmation dialog.
                if "学生" in self.passenger:
                    self.driver.is_element_present_by_id("dialog_xsertcj_ok",5)
                    self.driver.find_by_id("dialog_xsertcj_ok")[0].click()
                print("开始选座...")  # "selecting seat..."
                # 3: hard sleeper, 1: hard seat.
                self.driver.is_element_present_by_id("seatType_1",5)
                # splinter's select() requires a name attribute, but 12306
                # only sets ids on the options, so emulate select() by
                # clicking the <option> located via xpath.
                self.driver.find_by_xpath(
                    '//select[@id="%s"]//option[@value="%s"]' % ("seatType_1",str(self.seatType))
                ).first._element.click()
                print('提交订单...')  # "submitting order..."
                self.driver.is_element_present_by_id("submitOrder_id",5)
                #self.driver.find_by_id('submitOrder_id').click()
                #self.driver.is_element_present_by_id("qr_submit_id",5)
                #self.driver.find_by_id('qr_submit_id').click()
            else:
                # "Cannot find an element with text ..., exiting..."
                print('找不到text为\"%s\"的元素,程序即将退出...'%self.passenger)
# for newPos in boardPositions: # boardLayout = boardLayout.replace(newPos, pos[len(pos)-2:]) # for ph in placeholders: # boardLayout = boardLayout.replace(ph, " ") # print(boardLayout) entryURL = "https://www.chess.com/login" secondaryURL = "https://www.chess.com/tactics" browser = Browser() browser.visit(entryURL) browser.driver.set_window_position(0, 0, windowHandle='current') browser.driver.set_window_size(1920, 1080, windowHandle='current') browser.find_by_id("username").first.fill(username) browser.find_by_id("password").first.fill(password) browser.click_link_by_id("login") browser.visit(secondaryURL) startBtn = browser.driver.find_element_by_css_selector( "#sidebar .tactics-sidebar .btn-primary.btn-start") startBtn.click() styles = [] locations = [] lastMove = [] lastCapture = "" time.sleep(3) elements = browser.driver.find_element_by_id( "chess_com_tactics_board_boardarea").find_elements_by_tag_name("img") lastMovePositions = browser.driver.find_element_by_id( "chess_com_tactics_board_boardarea").find_elements_by_tag_name("div") index = lastMovePositions[len(lastMovePositions) - 1].get_attribute("style").find("translate(")
def scrape():
    """Scrape Mars news, the JPL featured image, the latest weather tweet,
    the facts table and the four hemisphere images, then show a
    completion popup.

    Returns:
        dict: scraped values keyed by news_title, news_p, feat_img,
        weather, mfacts (a DataFrame) and hemi_img (list of dicts).

    Side effects: opens Chrome windows, writes ../html/mars_facts.html,
    and shows a Windows MessageBox (requires Windows / ctypes.windll).
    """
    # Dependencies, kept function-local as in the original notebook export.
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import time
    import ctypes  # An included library with Python install.

    def Mbox(title, text, style):
        # Thin wrapper around the Win32 MessageBox (Windows only).
        return ctypes.windll.user32.MessageBoxW(0, text, title, style)

    mars_data_dict = {}

    ## (1) NASA Mars News: latest title and paragraph text.
    url_nz = 'https://mars.nasa.gov/news/'
    response_nz = requests.get(url_nz)
    soup_nz = BeautifulSoup(response_nz.text, 'lxml')
    # The slices strip surrounding whitespace/newline characters.
    news_title = soup_nz.find("div", class_="content_title").a.text[1:-1]
    news_p = soup_nz.find(
        "div", class_="image_and_description_container").a.text[3:-7]
    mars_data_dict["news_title"] = news_title
    mars_data_dict["news_p"] = news_p

    ## (2) JPL featured image: click through to the large-size image and
    ## record the resulting URL.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/details.")
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/images/largesize")
    time.sleep(2)
    featured_image_url = browser.url
    mars_data_dict["feat_img"] = featured_image_url
    browser.quit()

    ## (3) Mars weather: first tweet whose text starts with "InSight sol".
    url_tweet = 'https://twitter.com/marswxreport?lang=en'
    response_tweet = requests.get(url_tweet)
    soup_tweet = BeautifulSoup(response_tweet.text, 'lxml')
    tweets = soup_tweet.find_all(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    # BUG FIX: mars_weather raised NameError when no tweet matched;
    # default to an empty string.
    mars_weather = ""
    for tweet in tweets:
        find_text = tweet.text.find("InSight sol")
        if find_text == 0:
            mars_weather = tweet.text
            break
    mars_data_dict["weather"] = mars_weather

    ## (4) Mars facts table, also written to an HTML file.
    url_mfacts = 'https://space-facts.com/mars/'
    tables = pd.read_html(url_mfacts)[1]
    mars_data_dict["mfacts"] = tables
    tables.to_html("../html/mars_facts.html")

    ## (5) Hemispheres: click each result, open the "Sample" image in its
    ## popup window and record that window's URL. The four copy-pasted
    ## blocks of the original are folded into one loop (same order).
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    url_mhemi = ("https://astrogeology.usgs.gov/search/results"
                 "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(url_mhemi)
    time.sleep(2)
    hemisphere_image_urls = []
    for name in ("Cerberus Hemisphere Enhanced",
                 "Schiaparelli Hemisphere Enhanced",
                 "Syrtis Major Hemisphere Enhanced",
                 "Valles Marineris Hemisphere Enhanced"):
        browser.click_link_by_partial_text(name)
        time.sleep(2)
        title = browser.title.split("|")[0]
        browser.click_link_by_text("Sample")  # opens in a second window
        time.sleep(2)
        img_url = browser.windows[1].url
        time.sleep(2)
        browser.windows[1].close()
        browser.back()
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    mars_data_dict["hemi_img"] = hemisphere_image_urls
    browser.quit()
    Mbox("Mission to Mars Completed",
         "Congratulations!!! You've mined Mars!", 1)
    # BUG FIX: the dictionary was built but never returned (the original
    # ended with a bare `mars_data_dict` expression, a no-op).
    return mars_data_dict
def scrape_info():
    """Scrape Mars news, the JPL featured image, the weather tweet, the
    facts table and hemisphere images; return everything in one dict.

    Returns:
        dict with keys news_title, news_paragraph, jpl_image_absolute,
        weather_tweet, "df_mars_space_facts.to_html" (HTML string) and
        USGS_images.

    Side effects: opens and closes several Chrome windows and prints
    each scraped value.
    """
    # BUG FIX: the original created this browser, immediately replaced it
    # and never quit it, leaking a Chrome process.
    browser = init_browser()
    browser.quit()

    # URLs of pages to be scraped.
    url_nasa_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    url_jpl_images = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    url_twitter_weather = 'https://twitter.com/marswxreport?lang=en'
    url_sf_facts = 'https://space-facts.com/mars/'
    # The real USGS hemisphere search page returned a 404; a mirror is
    # used instead.
    url_USGS_hemispheres = 'http://www.labellelube.com/mars.html'

    # --- NASA Mars News ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_nasa_news)
    time.sleep(1)
    html = browser.html
    soup_news = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    news_title = soup_news.find("div", class_="content_title").text
    news_paragraph = soup_news.find("div", class_="article_teaser_body").text
    print(news_title)
    print(news_paragraph)
    browser.quit()
    time.sleep(1)

    # --- JPL featured image ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_jpl_images)
    time.sleep(1)
    browser.click_link_by_id('full_image')
    time.sleep(1)
    browser.click_link_by_partial_text('more info')
    time.sleep(1)
    html = browser.html
    soup_jpl = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    # The <figure class="lede"> anchor holds the site-relative image path.
    jpl_image = soup_jpl.find('figure', class_='lede').a['href']
    url_root = "https://www.jpl.nasa.gov/"
    jpl_image_absolute = url_root + jpl_image
    print(jpl_image_absolute)
    browser.quit()
    time.sleep(1)

    # --- Mars weather tweet ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_twitter_weather)
    time.sleep(1)
    html = browser.html
    soup_jpl = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    tweets = soup_jpl.find_all('div', class_='js-tweet-text-container')
    time.sleep(1)
    # BUG FIX: weather_tweet could be unbound if no tweet was found.
    weather_tweet = ""
    for tweet in tweets:
        weather_tweet = tweet.find('p').text
        # BUG FIX: the original tested `'Sol' and 'pressure' in ...`,
        # which ignores 'Sol' (a non-empty literal is always truthy);
        # require both substrings as clearly intended.
        if 'Sol' in weather_tweet and 'pressure' in weather_tweet:
            print(weather_tweet)
            break
    browser.quit()
    time.sleep(1)

    # --- Mars facts table ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_sf_facts)
    time.sleep(1)
    # Table index 1 is the Mars-only table (index 0 compares with Earth).
    df_mars_facts = pd.read_html(url_sf_facts)
    df_mars_space_facts = df_mars_facts[1]
    df_mars_space_facts.columns = ['Description', 'Value']
    df_mars_space_facts.set_index('Description', inplace=True)
    print(df_mars_space_facts.to_html())
    browser.quit()

    # --- Mars hemispheres (mirror site) ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_USGS_hemispheres)
    time.sleep(2)
    html = browser.html
    soup_USGS = BeautifulSoup(html, "html.parser")
    items = soup_USGS.find_all('div', class_='item')
    USGS_images = []
    url_root = 'http://www.labellelube.com/'
    for i in items:
        title = i.find('h2').text
        image = i.find('img')['src']
        USGS_images.append({"title": title, "link": url_root + image})
    browser.quit()

    # Store data in a dictionary.
    mars_data = {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "jpl_image_absolute": jpl_image_absolute,
        "weather_tweet": weather_tweet,
        # BUG FIX: the original stored the bound method object
        # (df_mars_space_facts.to_html) instead of calling it; store the
        # rendered HTML string under the same key.
        "df_mars_space_facts.to_html": df_mars_space_facts.to_html(),
        "USGS_images": USGS_images
    }
    return mars_data
def scrape_info():
    """Scrape Mars news, featured image, weather, facts table and all
    hemisphere images into a single dictionary.

    Returns:
        dict with keys NewsTitle, NewsDescription, FeaturedImage,
        WeatherTweet, MarsTable, HemisphereImages (list of
        {"title", "img_url"}), plus ImageTitle/ImageURL kept for
        backward compatibility (the last hemisphere scraped).
    """
    # BUG FIX: the original created this browser, immediately replaced it
    # and never quit it, leaking a Chrome process.
    browser = init_browser()
    browser.quit()
    mars_info = {}
    browser = Browser("chrome", headless=False)

    # --- NASA Mars news: latest title and teaser paragraph ---
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    mars_info["NewsTitle"] = news_title
    mars_info["NewsDescription"] = news_p

    # --- JPL Mars Space Images - featured image ---
    base_url = "https://www.jpl.nasa.gov"
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(1)
    browser.click_link_by_id('full_image')
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    featured_image_url1 = soup.find('img', class_="fancybox-image")
    # src is site-relative; prefix the JPL host for a complete URL.
    featured_image_url = base_url + featured_image_url1['src']
    print(featured_image_url)
    mars_info["FeaturedImage"] = featured_image_url

    # --- Mars weather from Twitter ---
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    mars_info["WeatherTweet"] = mars_weather

    # --- Mars facts table, rendered to HTML ---
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    tables = pd.read_html(html)
    mars_info["MarsTable"] = tables[0].to_html()

    # --- Mars hemispheres: click each description link, grab the
    # full-resolution image URL and title ---
    base_url1 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(base_url1)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    link_lists = soup.find_all("div", class_="description")
    imagelist = []
    for link_list in link_lists:
        linktext = link_list.h3.text
        browser.click_link_by_partial_text(linktext)
        time.sleep(1)
        soup2 = BeautifulSoup(browser.html, "html.parser")
        # The full-size image opens in a new tab (target="_blank").
        image_url1 = soup2.find('a', target="_blank")
        img_url = image_url1['href']
        title = soup2.find('h2', class_="title").get_text()
        imagelist.append({"title": title, "img_url": img_url})
        browser.back()
        time.sleep(1)
    # BUG FIX: the collected hemisphere list was built but never stored —
    # only the last title/URL survived. Keep the old keys for
    # backward compatibility and add the full list.
    mars_info["HemisphereImages"] = imagelist
    mars_info["ImageTitle"] = title
    mars_info["ImageURL"] = img_url
    # BUG FIX: the browser was never closed.
    browser.quit()
    return (mars_info)
# In[4]: executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # In[5]: url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'lxml') # In[6]: #browser.click_link_by_partial_href('/spaceimages/images/mediumsize/PIA17932_ip.jpg') browser.click_link_by_id('full_image') # In[7]: html = browser.html soup = BeautifulSoup(html, 'lxml') featured_image = soup.find('img', class_='fancybox-image') featured_image_url = 'https://www.jpl.nasa.gov' + featured_image['src'] browser.quit() print(featured_image_url) # ### Mars Weather # In[8]: # Retrieve page with the requests module
def scrape():
    """Scrape Mars news, the JPL featured image, the weather tweet, the
    facts table and hemisphere image URLs into a single dictionary.

    Returns:
        dict with keys news_title, news_para, featured_img_title,
        featured_img_url, weather, html_table, hemispheres.

    Side effects: opens a Chrome window (splinter) and prints progress.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    print("SAM after setting browser")
    # Store all the scraped data in a dictionary.
    mars_dictionary = {}

    # --- NASA Mars news (parsed from a static requests fetch) ---
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get the title.
    container = soup.find('div', class_="content_title")
    news_title = container.a.text
    # Get the paragraph description.
    container = soup.find('div', class_="image_and_description_container")
    text_tot = container.find('div', class_="rollover_description_inner")
    news_p = text_tot.text
    print("title: ", news_title)
    print("paragraph: ", news_p)
    mars_dictionary["news_title"] = news_title
    mars_dictionary["news_para"] = news_p

    # --- JPL featured image. BUG FIX: the original opened a second
    # Chrome instance here without quitting the first (process leak);
    # reuse the already-open browser instead. ---
    jpl_url = 'https://www.jpl.nasa.gov'
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Retrieve the article with the featured image and its title.
    article = soup.find('article', class_='carousel_item')
    h1 = article.find('h1', class_='media_feature_title').text
    print(h1)
    browser.click_link_by_id("full_image")
    # Then click "more info" to get to the full-size image page.
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img_url = soup.find('img', class_="main_image")['src']
    featured_image_url = jpl_url + img_url
    print(featured_image_url)
    mars_dictionary["featured_img_title"] = h1
    mars_dictionary["featured_img_url"] = featured_image_url
    # BUG FIX: the browser was never quit.
    browser.quit()

    # --- Mars weather tweet ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    streams = soup.find_all('div', class_="tweet")
    # BUG FIX: mars_weather could be unbound if no tweet matched.
    mars_weather = ""
    for tweet in streams:
        mars_weather = tweet.find('p').text
        # BUG FIX: the original condition `'Sol' and 'pressure' in ...`
        # ignored 'Sol' (a non-empty literal is always truthy); require
        # both substrings as clearly intended.
        if 'Sol' in mars_weather and 'pressure' in mars_weather:
            print(mars_weather)
            break
    mars_dictionary["weather"] = mars_weather

    # --- Mars facts table ---
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    facts_df = pd.DataFrame(tables[0])
    # Change the column headers (positional rename 0 -> Type, 1 -> Value).
    header = pd.Series(["Type", "Value"])
    facts_df.rename(columns=header, inplace=True)
    # BUG FIX: set_index() returns a new frame; the original discarded
    # the result, so the index was never set.
    facts_df = facts_df.set_index('Type')
    html_table = facts_df.to_html()
    # BUG FIX: str.replace returns a new string; the original discarded
    # the result, leaving the newlines in place.
    html_table = html_table.replace('\n', '')
    mars_dictionary["html_table"] = html_table

    # --- USGS hemisphere images ---
    base_url = "https://astrogeology.usgs.gov"
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    hemispheres = soup.find_all('a', class_="itemLink")
    print(hemispheres)
    hemisphere_image_urls = []
    for item in hemispheres:
        # Title and relative link from the search-result anchor.
        title = item.text
        link = item['href']
        full_link = base_url + link
        # Follow each hemisphere page to find the full-resolution image
        # inside the div class='downloads' block.
        response = requests.get(full_link)
        soup = BeautifulSoup(response.text, 'html.parser')
        high_res = soup.find('div', class_='downloads')
        full_href = high_res.find('a')['href']
        hemisphere_image_urls.append({"title": title, "img_url": full_href})
    mars_dictionary["hemispheres"] = hemisphere_image_urls
    print(mars_dictionary)
    return mars_dictionary
def scraping():
    """Scrape location metrics for the module-level `Address`: driving
    miles to the Domain and downtown Austin, median household income,
    population, year built and walk/transit scores, then redirect to the
    ML page.

    Results are published through module-level globals
    (miles_from_domain, miles_from_downtown, median_household_income,
    population, built_year, walk_score, transit_score).

    NOTE(review): a Google API key and CoStar credentials are hard-coded
    below — move them to configuration/secret storage.
    NOTE(review): `Address`, `req` (requests), `redirect`, `time`,
    `BeautifulSoup` and `Browser` are assumed to be defined or imported
    elsewhere in this file — confirm.
    """
    # Scrape miles from Domain and Downtown via the Distance Matrix API.
    payload = {
        "units": "imperial",
        "origins": Address,
        "destinations": "11410 Century Oaks Terrace, Austin, TX 78758|1100 Congress Ave, Austin, TX 78701",
        "key": "AIzaSyCQhKXIlYN6TQ3MHT4lujpN0lXAyB1Tvyo"
    }
    response = req.get(
        "https://maps.googleapis.com/maps/api/distancematrix/json",
        params=payload).json()
    global miles_from_domain
    global miles_from_downtown
    miles_from_domain = response['rows'][0]['elements'][0]['distance']['text']
    miles_from_downtown = response['rows'][0]['elements'][1]['distance'][
        'text']
    # Keep only the numeric part of e.g. "3.2 mi".
    miles_from_domain = miles_from_domain.split(' ')[0]
    miles_from_downtown = miles_from_downtown.split(' ')[0]

    # Scrape demographic data.
    # URL of page to be scraped
    url_income = "http://www.energyjustice.net/justice/index.php"
    url_population = "https://www.freemaptools.com/find-population.htm"
    # retrive page with the requests module
    response = req.get(url_income)
    # create beautifulsoup object; parse with 'html.parser'
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    global median_household_income
    browser.visit(url_income)
    browser.fill('gsLocation', Address)
    browser.find_by_name('gsSubmit_desk').first.click()
    time.sleep(5)
    browser.find_by_id('income_layer_desk').first.click()
    time.sleep(5)
    map_soup = BeautifulSoup(browser.html, 'html.parser')
    # presumably the cell reads like "$52,431 ..." — the splits strip the
    # trailing text, the "$" and the thousands comma. TODO confirm format.
    income = map_soup.find_all('table')[1].find_all('td')[1].text
    income = income.split(' ')[0]
    income = income.split('$')[1]
    income = income.split(',')[0] + income.split(',')[1]
    median_household_income = float(income)
    global population
    browser.visit(url_population)
    time.sleep(5)
    # 1.61 km is a one-mile radius.
    browser.fill('radiusinputkm', '1.61')
    time.sleep(5)
    browser.find_by_id('tb_searchlocation').fill(Address)
    # browser.find_by_id('tb_searchlocation').fill('\n')
    time.sleep(5)
    browser.find_by_tag('p')[3].click()
    population = browser.find_by_id('div_output').text
    population = population.split(' ')[-1]
    browser.quit()

    # Scrape the built_year from CoStar.
    # set the chromedriver path
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    costar_url = "https://costar.com/"
    browser.visit(costar_url)
    # login — NOTE(review): hard-coded credentials; move to secrets.
    browser.click_link_by_id('loginLink')
    browser.fill('username', 'SDong2')
    browser.fill('password', '719111719111')
    browser.click_link_by_id('loginButton')
    time.sleep(20)
    try:
        x_path = '//*[@id="cs-gateway-home-page"]/div[2]/div[1]/div/div/div[2]/div/div[1]/input'
        search_box = browser.find_by_xpath(x_path)
        search_box.fill(Address)
        # Pick the first autocomplete suggestion.
        search_button = browser.find_by_xpath(
            '//*[@id="react-autowhatever-1--item-0"]/div/span[1]')
        search_button.click()
        time.sleep(20)
        global built_year
        built_year = browser.find_by_xpath(
            '//*[@id="Building_YearBuilt"]/span[2]').text
        # presumably values longer than 4 chars include a prefix before
        # the year (e.g. "Built 1997") — TODO confirm.
        if len(built_year) > 4:
            built_year = built_year.split(' ')[1]
            built_year = float(built_year)
        else:
            built_year = float(built_year)
    except:
        # Fall back to a default year when the CoStar lookup fails.
        built_year = 1997
    browser.quit()

    # Scrape walk scores.
    # set the chromedriver path
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    walk_score_url = 'https://www.walkscore.com/'
    browser.visit(walk_score_url)
    search_entry = browser.find_by_id('gs-street')
    search_entry.fill(Address)
    time.sleep(10)
    browser.find_by_css('.go-btn')[1].click()
    time.sleep(10)
    html = browser.html
    score_soup = BeautifulSoup(html, 'html.parser')
    scores = []
    # Score badges are images whose file name encodes the score value.
    for i in range(1, len(score_soup.find_all('img'))):
        try:
            score_path = score_soup.find_all('img')[i]['src']
            split = score_path.split('/')[-1]
            score = split.split('.')[0]
            scores.append(score)
        except:
            print("no src")
            break
    global walk_score
    global transit_score
    # presumably the last three badges are walk/transit/bike — TODO confirm.
    walk_score = scores[-3]
    transit_score = scores[-2]
    browser.quit()
    # redirect to the machine learning page
    return redirect('/ml')
def scrape():
    """Scrape Mars news, featured image, weather tweet and facts table.

    Returns a dict with keys: news_title, news_paragraph, featured_image,
    weather, mars_facts_table. Also downloads the featured image into
    Resources/. Requires a local chromedriver at C:/chromedriver/.
    """
    # dependencies (function-local imports; some — splinter, webdriver, os,
    # pymongo, json — appear unused in this body)
    from bs4 import BeautifulSoup as bs
    import splinter
    import requests
    from splinter import Browser
    import time
    import pandas as pd
    from selenium import webdriver
    import os
    import pymongo
    import json

    # The dictionary accumulating all scraped results
    mars_facts_data = {}

    # 1 — latest NASA Mars news headline + teaser paragraph
    # emulate the browser and get the html
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    # url to visit
    url = 'https://mars.nasa.gov/news/'
    # We need the browser (not requests) because many elements only exist
    # after the page's JavaScript has run; requests would get raw HTML only.
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_p = soup.select_one("div.rollover_description_inner")
    news_title = soup.select_one("div.content_title")
    news_p = news_p.text
    news_title = news_title.text
    mars_facts_data['news_title'] = news_title
    mars_facts_data['news_paragraph'] = news_p

    # 2 — JPL featured image: click through to the full-size image URL
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_id('full_image')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)
    time.sleep(3)
    browser.click_link_by_partial_href('/spaceimages/images/')
    # Download the image and store it under Resources/ using the URL basename
    response = requests.get(browser.url)
    if response.status_code == 200:
        linkname = (browser.url.rsplit('/', 1)[-1])
        SaveFile = (f'Resources/{linkname}')
        with open(SaveFile, 'wb') as f:
            f.write(response.content)
    print(browser.url)
    Space_image_dict = {}
    Space_image_dict['Url'] = browser.url
    mars_facts_data['featured_image'] = browser.url
    # collection.insert_one(Space_image_dict)

    # 3 — latest Mars weather tweet (CSS class is Twitter's legacy markup;
    # fragile against Twitter redesigns)
    mars_weather_dict = {}
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    mars_weather = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text
    mars_weather = mars_weather.strip()
    mars_facts_data['weather'] = mars_weather
    mars_facts_data  # no-op bare expression (notebook remnant)
    # collection.insert_one(mars_weather_dict)

    # 4 — Mars facts table, scraped via pandas and re-emitted as HTML
    url = 'https://space-facts.com/mars/'
    df = pd.read_html(url)
    # df = pd.DataFrame(df)
    df = df[0]
    df.columns = ['Category', 'Measure']
    df.set_index('Category', inplace=True)
    mars_html_table = df.to_html()
    mars_html_table = mars_html_table.replace("\n", "")
    mars_facts_data['mars_facts_table'] = mars_html_table
    return mars_facts_data
class ChopeBrowser:
    """Automates the NTU PC/facility booking site via a splinter Chrome browser.

    Depends on module-level names: Browser (splinter), BeautifulSoup, time,
    pyautogui.

    Fixes vs. previous revision:
      * click_next passed a spurious `counter` argument to check_facility
        (TypeError at runtime).
      * is_registered lacked `self` and was unusable as an instance method;
        it is now a @staticmethod.
      * book_pc used a bare `except:`; narrowed to `except Exception`.
    """

    def __init__(self, headless=False):
        # One Chrome session shared by all methods.
        self.chrome = Browser('chrome', headless=headless)

    def time_delay(self, time):
        # Sleep hack: waiting for a name that never exists blocks for
        # `wait_time` seconds. (Parameter `time` shadows the time module,
        # which is fine here since the module isn't used in this method.)
        self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!',
                                               wait_time=time)

    def login(self, usr, pwd, domain='STUDENT'):
        """Sign in to the booking portal under the given domain."""
        url = 'https://ntupcb.ntu.edu.sg'
        url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs'
        self.chrome.visit(url)
        # Select the login domain from the <option> list.
        dropdown = self.chrome.find_by_tag('option')
        for option in dropdown:
            if option.text == domain:
                option.click()
        self.chrome.fill('Username', usr)
        # Trailing newline submits the form.
        self.chrome.fill('Password', pwd + '\n')

    # PC BOOKING STARTS HERE
    def pc_setup(self, usr, pwd, Type):
        """Log in and try to book the first free PC of the given type.

        Returns (pc_name_or_'no pc', 'booked'|'cannot book').
        """
        self.login(usr, pwd)
        button = self.chrome.find_by_id('tdPcBook')
        button.click()
        time.sleep(2)
        with self.chrome.get_iframe('frmAdminViewControls') as iframe:
            iframe.find_by_id('pnlInsLoc3').click()
        self.type_number(Type)
        data = self.scrape_pc()
        can_book = self.book_pc(data[1], data[2])
        self.chrome.quit()
        return data[0], can_book

    def type_number(self, Types):
        """Click the PC-group panel whose label matches `Types`.

        Returns None on success, 0 if no group matched (legacy contract).
        """
        for i in range(0, 4):
            with self.chrome.get_iframe('frmAdminViewControls') as iframe:
                page = iframe.find_by_id('pnlInsPcGrp' + str(i))
                if page != []:
                    label = BeautifulSoup(page.html, "lxml").find(
                        "span",
                        {"style": "display:inline-block;height:20px;width:80px;"}
                    ).get_text()
                    if label == Types:
                        iframe.find_by_id('pnlInsPcGrp' + str(i)).click()
                        return
        return 0

    def scrape_pc(self):
        """Scan the seating grid for the first available (white) PC.

        Returns (pc_name, col, row); ('no pc', 100, 100) when none is free.
        """
        with self.chrome.get_iframe('frmSeating') as iframe:
            for row in range(0, 6):
                for col in range(1, 11):
                    cell = iframe.find_by_id(
                        'grdSeating_tblCol' + str(col) + '_' + str(row))
                    if cell == []:
                        # Ran off the end of the grid — nothing free.
                        return 'no pc', 100, 100
                    # White background marks an available seat.
                    if self.color(cell.html) == '#FFFFFF':
                        return self.name_pc(cell.html), col, row
            return 'no pc', 100, 100

    def name_pc(self, codes):
        """Extract the PC's display name from a seating-cell HTML snippet."""
        soup = BeautifulSoup(codes, "lxml")
        mydivs = soup.findAll("span", {"class": "lblPcName"})
        return mydivs[0].get_text()

    def color(self, code):
        """Return '#FFFFFF' if the cell is available, else a dummy value."""
        soup = BeautifulSoup(code, "lxml")
        tag = soup.findAll('td', {"style": "background-color: #FFFFFF"})
        if tag != []:
            return '#FFFFFF'
        else:
            return 'blabla'

    def book_pc(self, col, row):
        """Click the selected seat and submit; return 'booked' or 'cannot book'."""
        with self.chrome.get_iframe('frmSeating') as iframe:
            # (100, 100) is the sentinel for "no PC found".
            if (col != 100) and (row != 100):
                try:
                    time.sleep(1)
                    butt = iframe.find_by_id("grdSeating_divOuterCol" +
                                             str(col) + "_" + str(row))
                    if butt != []:
                        butt.click()
                        time.sleep(1)
                        sub = iframe.find_by_name("btnsumit")
                        sub.click()
                        return "booked"
                except Exception:
                    # Site sometimes pops a native dialog — dismiss it.
                    pyautogui.press('enter')
                    return "cannot book"
            return "cannot book"

    def first_setup(self):
        """Navigate from the landing page to the facility-booking table."""
        button = self.chrome.find_by_id('tdFacilityBook')
        button.click()
        self.chrome.click_link_by_href('#8')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69')
        self.chrome.click_link_by_id('book')
        self.chrome.click_link_by_id('changeResource')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_id('book')

    # Eliminates unnecessary booking slots
    @staticmethod
    def is_registered(event):
        """True for a real booked slot (not a placeholder/current-event cell)."""
        if event.has_class('noShowWhite'):
            return False
        if event.has_class('currentEvent'):
            return False
        return True

    def check_facility(self, evFacilities):
        """Append one week of booked slots for the current facility.

        evFacilities gains a list of days, each day a list of
        [start, end] pairs split on the em-dash in the event text.
        """
        columnWeek = self.chrome.find_by_css('.wc-event-column')
        evWeek = []
        for columnDay in columnWeek:
            evToday = []
            evList = columnDay.find_by_css('.ui-corner-all')
            for event in evList:
                if not event.has_class('noShowWhite'):
                    if not event.has_class('currentEvent'):
                        event = event.text
                        # Keep only entries shaped "start—end".
                        if not event.find('—') == -1:
                            if event == '':
                                continue
                            evToday.append(event.split('—'))
            evWeek.append(evToday)
        evFacilities.append(evWeek)

    def click_next(self, counter, evFacilities):
        # Recursively check facilities, choosing one by `counter`.
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        if counter < len(options):
            nextOption = options[counter]
            nextOption.click()
            # FIX: check_facility takes only the accumulator list; the old
            # call passed `counter` as well and raised TypeError.
            self.check_facility(evFacilities)
        else:
            return evFacilities

    # Scrape seats main function
    # OPTIMIZE: by multithreading
    # and by running multiple browsers at once
    def scrape_seats(self, usr, pwd):
        """Log in and collect weekly bookings for every facility option.

        Returns a flat list alternating facility names and their week data.
        """
        self.login(usr, pwd)
        self.first_setup()
        evFacilities = []
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        optRange = range(len(options))
        for i in optRange:
            opt = options[i]
            nextOption = opt
            nextOption.click()
            self.time_delay(0.2)  # give the calendar time to refresh
            evFacilities.append(opt.text)
            self.check_facility(evFacilities)
        self.quit()
        return evFacilities

    def quit(self):
        """Close the underlying Chrome session."""
        self.chrome.quit()
def scrape():
    """Notebook-exported scraper: collects Mars news, featured image, weather,
    facts table and hemisphere images; inserts the combined dict into the
    mars_db.mars_data Mongo collection and returns it.
    """
    # Dependencies
    import time
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from splinter import Browser
    # from selenium.webdriver.common import action_chains, keys
    # from selenium import webdriver
    import pymongo

    # Local MongoDB; the target collection is dropped and rebuilt each run.
    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_data = db.mars_data
    db.mars_data.drop()

    # having issues with browser, use webdriver instead
    # driver = webdriver.Chrome()
    # url = 'https://mars.nasa.gov/news/'
    # driver.get(url)
    # html = driver.page_source
    # soup = BeautifulSoup(html, 'lxml')

    # --- NASA Mars news: latest title and teaser paragraph -----------------
    browser = Browser('chrome', headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(5)  # fixed waits throughout; fragile if the page is slow
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    print(soup.prettify())
    # Extract news title text
    title = soup.find('div', class_='bottom_gradient').text
    print(title)
    # Extract paragraph text
    paragraph = soup.find('div', class_='rollover_description_inner').text
    print(paragraph)

    # --- JPL featured space image ------------------------------------------
    # Click through 'full image' -> 'more info', then grab the <img> src.
    from splinter import Browser
    # img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    # executable_path = {'executable_path': './chromedriver'}
    # browser = Browser('chrome', **executable_path)
    # browser.visit(img_url)
    browser = Browser('chrome', headless=False)
    img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    browser.visit(img_url)
    time.sleep(5)
    browser.click_link_by_id('full_image')
    time.sleep(5)
    browser.find_link_by_partial_text('more info').click()
    # time.sleep(5)
    # browser.find_link_by_partial_text('.jpg').click()
    time.sleep(5)
    # NOTE(review): positional index [6] into all <img> tags is layout-
    # dependent and will break if the page markup changes.
    featured_image_url = browser.find_by_tag('img')[6]['src']
    featured_image_url  # no-op bare expression (notebook remnant)

    # --- Mars weather: latest tweet from @MarsWxReport ---------------------
    from splinter import Browser
    browser = Browser('chrome', headless=False)
    tw_acct_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tw_acct_url)
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    # print(soup.prettify())
    container = soup.find('div', class_='js-tweet-text-container')
    container  # no-op (notebook remnant)
    # Legacy Twitter CSS classes — fragile against redesigns.
    mars_weather = container.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_weather  # no-op (notebook remnant)

    # --- Mars facts table via pandas ---------------------------------------
    marsfacts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(marsfacts_url)
    tables  # no-op (notebook remnant)
    df = tables[0]
    df
    df = df.rename(columns={0: 'Measurement', 1: 'Value'})
    df = df.set_index('Measurement')
    df
    # convert table to html string
    html_table = df.to_html()
    html_table
    # strip unwanted newlines to clean up the table
    html_table = html_table.replace('\n', '')
    html_table

    # --- Mars hemispheres: title + full-res image URL for each of the 4 ----
    from splinter import Browser
    browser = Browser('chrome', headless=False)
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    time.sleep(5)
    # NOTE(review): each hemisphere uses the positional link index [41] for
    # the image URL — extremely layout-dependent; verify against the site.
    browser.find_by_css('h3')[0].click()
    img1_url = browser.find_by_tag('a')[41]['href']
    print(img1_url)
    img1_title = browser.find_by_css('h2')[0].text
    img1_title = img1_title.replace(' Enhanced', '')
    print(img1_title)

    browser.back()
    browser.find_by_css('h3')[1].click()
    img2_url = browser.find_by_tag('a')[41]['href']
    print(img2_url)
    img2_title = browser.find_by_css('h2')[0].text
    img2_title = img2_title.replace(' Enhanced', '')
    print(img2_title)

    browser.back()
    browser.find_by_css('h3')[2].click()
    img3_url = browser.find_by_tag('a')[41]['href']
    print(img3_url)
    img3_title = browser.find_by_css('h2')[0].text
    img3_title = img3_title.replace(' Enhanced', '')
    print(img3_title)

    browser.back()
    browser.find_by_css('h3')[3].click()
    img4_url = browser.find_by_tag('a')[41]['href']
    print(img4_url)
    img4_title = browser.find_by_css('h2')[0].text
    img4_title = img4_title.replace(' Enhanced', '')
    print(img4_title)

    # Use a Python dictionary to store the data using the keys img_url and title.
    hemisphere_img_dict = [
        {
            "title": img1_title,
            "img_url": img1_url
        },
        {
            "title": img2_title,
            "img_url": img2_url
        },
        {
            "title": img3_title,
            "img_url": img3_url
        },
        {
            "title": img4_title,
            "img_url": img4_url
        },
    ]

    # Persist everything and hand the same payload back to the caller.
    data_outputs = {
        'title': title,
        'paragraph': paragraph,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'html_table': html_table,
        'hemisphere_img_dict': hemisphere_img_dict
    }
    # NOTE(review): Collection.insert is deprecated in pymongo 3.x in favor
    # of insert_one.
    mars_data.insert(data_outputs)
    return data_outputs
browser.find_by_id('firstheader') browser.find_by_value('query') # get element first_found = browser.find_by_name('name').first last_found = browser.find_by_name('name').last second_found = browser.find_by_name('name')[1] # Get value of an element browser.find_by_css('h1').first.value # Clicking links,return the first link browser.click_link_by_href('http://www.the_site.com/my_link') browser.click_link_by_partial_href('my_link') browser.click_link_by_text('my link') browser.click_link_by_partial_text('part of link text') browser.click_link_by_id('link_id') # element is visible or invisible browser.find_by_css('h1').first.visible #fill content browser.find_by_id('productName').fill( 'splinter - python acceptance testing for web applications') browser.fill('q', 'splinter - python acceptance testing for web applications') # Verifying if element has a className browser.find_by_css('.content').first.has_class('content') # click button browser.find_by_name('send').first.click() browser.find_link_by_text('my link').first.click()
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres, and
    insert each piece into its own collection of the module-level Mongo `db`.

    Relies on module-level names not visible here: Browser, bs
    (BeautifulSoup), time, pd (pandas), db (pymongo database).
    """
    # --- NASA Mars news ----------------------------------------------------
    url = 'https://mars.nasa.gov/news/'
    browser = Browser('chrome')
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    # title of first article
    title = soup.find('div', class_='content_title')
    time.sleep(2)
    news_title = title.text
    print(news_title)
    # paragraph text of first article
    para = soup.find('div', class_='article_teaser_body')
    time.sleep(2)
    new_para = para.text
    print(new_para)
    news = {'title': news_title, 'paragraph': new_para}

    # --- JPL featured image: open the fancybox, read the <img> src ---------
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&q=Mars'
    browser.visit(url2)
    time.sleep(2)
    browser.click_link_by_id("full_image")
    time.sleep(2)
    html2 = browser.html
    soup2 = bs(html2, 'lxml')
    # soup2
    imagediv = soup2.find('img', class_="fancybox-image")['src']
    imagediv  # no-op bare expression (notebook remnant)
    featured_image_url = 'https://www.jpl.nasa.gov' + imagediv
    print(featured_image_url)
    imageurl = {'featured_image': featured_image_url}

    # --- Mars weather tweet (legacy Twitter CSS classes; fragile) ----------
    url3 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url3)
    html3 = browser.html
    soup3 = bs(html3, 'lxml')
    weather = soup3.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
    mars_weather = weather.text
    print(mars_weather)
    marsweather = {'weather': mars_weather}

    # --- Mars facts table via pandas ---------------------------------------
    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    html4 = browser.html
    soup4 = bs(html4, 'lxml')
    tables = pd.read_html(html4)
    info_table = tables[0]
    mars_info = {'table': info_table.to_html()}

    def retrieve_hemis():
        """Visit the USGS results page and collect {'title', 'img_url'} for
        each of the four hemisphere articles (closes over `browser`)."""
        # URL for USGS Astrogeology
        url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        # Visit URL and parse html
        browser.visit(url5)
        html5 = browser.html
        soup5 = bs(html5, 'lxml')
        # find the articles
        articles = soup5.find_all('div', class_='description')[0:4]
        # create list object to store output
        imgs = []
        # iterate over articles
        for article in articles:
            img = {}
            href = article.h3.text
            # Follow the article link by its heading text, then grab the
            # first target=_blank anchor (the full-resolution image link).
            browser.click_link_by_partial_text(href)
            html5 = browser.html
            soup5 = bs(html5, 'lxml')
            img['title'] = href
            img['img_url'] = soup5.find('a', target='_blank')['href']
            imgs.append(img)
            # restart process: back to the results page for the next article
            browser.visit(url5)
        return (imgs)

    imgs = retrieve_hemis()
    print(imgs)

    # Persist each payload to its own collection.
    # NOTE(review): Collection.insert is deprecated in pymongo 3.x in favor
    # of insert_one.
    db.news.insert(news)
    db.imageurl.insert(imageurl)
    db.marsweather.insert(marsweather)
    db.mars_hemispheres.insert_many(imgs)
    db.mars_info.insert(mars_info)
    # scrape()
def scrape_all():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Returns a dict with keys: news_title, news_paragraph, featured_image,
    featured_image_description, hemisphere_image_urls, facts, last_modified.

    Relies on module-level names not visible here: Browser, time,
    BeautifulSoup, pd (pandas), dt (datetime module).
    """
    # One browser session reused for all sites; expects chromedriver.exe on
    # the working directory / PATH.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars news ----------------------------------------------------
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', class_="list_text")
    # NOTE(review): bare excepts below swallow any failure (including
    # article being None) and substitute a retry message — deliberate
    # best-effort, but consider narrowing to AttributeError.
    try:
        title = article.find('div', class_='content_title').text.strip()
    except:
        title = '[No information returned. Click the button again.]'
    try:
        para = article.find('div', class_='article_teaser_body').text.strip()
    except:
        para = '[No information returned. Click the button again.]'

    # --- JPL featured image ------------------------------------------------
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_id('full_image')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # The fancybox anchor carries the image path in data-fancybox-href.
    featured_image_url = soup.article.a['data-fancybox-href']
    featured_image_url = (f'https://www.jpl.nasa.gov{featured_image_url}')
    descrip = soup.h1.text.strip()

    # --- Mars facts table via pandas ---------------------------------------
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['', 'Mars']
    df = df.set_index('')
    mars_facts = df.to_html(classes='table')

    # --- Mars hemispheres --------------------------------------------------
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Click the link for the large image, find title and link, add try and
    # except, store in dict format, append to list
    hemisphere_image_urls_dict = []
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        hemi_soup = BeautifulSoup(browser.html, "html.parser")
        try:
            title_elem = hemi_soup.find("h2", class_="title").get_text()
            sample_elem = hemi_soup.find("a", text="Sample").get("href")
        except AttributeError:
            # Element not found on this page — record None rather than fail.
            title_elem = None
            sample_elem = None
        hemispheres = {
            "title": title_elem,
            "img_url": sample_elem
        }
        # Append hemisphere info
        hemisphere_image_urls_dict.append(hemispheres)
        # Finally, we navigate backwards to the results list
        browser.back()

    browser.quit()
    data = {
        'news_title': title,
        'news_paragraph': para,
        'featured_image': featured_image_url,
        'featured_image_description': descrip,
        'hemisphere_image_urls': hemisphere_image_urls_dict,
        'facts': mars_facts,
        'last_modified': dt.datetime.now()
    }
    return (data)
def scrape_all():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Returns a dict with keys: latest_title, latest_description,
    featured_image, mars_fact_table, hemispheres.

    Relies on module-level names not visible here: Browser, time,
    bs (BeautifulSoup), pd (pandas). Expects chromedriver.exe on PATH /
    working directory.

    Fix vs. previous revision: removed a dead statement that searched for
    <a class="description"> only to be immediately overwritten by the
    <div class="description"> search.
    """
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser('chrome', **executable_path, headless=True)

    # --- NASA Mars news: latest title and teaser ---------------------------
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(3)  # fixed waits throughout; fragile if pages are slow
    page = browser.html
    soup = bs(page, 'html.parser')
    # Title text and description
    results = soup.find('div', class_='image_and_description_container')
    title = results.find('div', class_='content_title')
    title_text = title.a.text
    description = results.find('div', class_='article_teaser_body').text

    # --- JPL featured image: click through to the large image --------------
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    # Large-image page HTML
    page = browser.html
    soup = bs(page, 'html.parser')
    results = soup.find('img', class_='main_image')
    image_link = results['src']
    featured_image_url = ("https://www.jpl.nasa.gov" + image_link)

    # --- Mars facts table via pandas ---------------------------------------
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)
    mars_facts = tables[0]
    mars_facts.columns = ['Facts', 'Mars']
    mars_facts.set_index('Facts', inplace=True)
    fact_table = mars_facts.to_html(classes="table table-striped")

    # --- Mars hemispheres: title + wide image for each result --------------
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(2)
    page = browser.html
    soup = bs(page, 'html.parser')
    results = soup.find_all('div', class_='description')
    hemispheres = []
    for result in results:
        # Each result's anchor points at the hemisphere detail page.
        detail_url = "https://astrogeology.usgs.gov" + result.a['href']
        browser.visit(detail_url)
        time.sleep(2)
        url_soup = bs(browser.html, 'html.parser')
        img_path = url_soup.find('img', class_="wide-image")['src']
        img_url = ("https://astrogeology.usgs.gov" + img_path)
        hemi_title = url_soup.find('h2', class_='title').text
        hemispheres.append({
            "title": hemi_title,
            "img_url": img_url
        })

    data = {
        "latest_title": title_text,
        "latest_description": description,
        "featured_image": featured_image_url,
        "mars_fact_table": fact_table,
        "hemispheres": hemispheres
    }
    browser.quit()
    return data
def scrape_mars(): # Defines the path to the chrome driver and create a browser object executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) # Defines url of site to be scraped and navigates to it url = 'https://mars.nasa.gov/news/' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'lxml') #print(soup.prettify()) time.sleep(5) # Scrapes the first news headline and description and save to variable news_title = soup.find('div', class_="bottom_gradient").h3.text news_p = soup.find('div', class_="article_teaser_body").text ### ### # ------------------------------------------------------------------------- # ### ### # executable_path = {'executable_path': 'chromedriver.exe'} # browser = Browser('chrome', **executable_path, headless=True) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) # HTML object html = browser.html soup = BeautifulSoup(html, 'html.parser') # Click the full image button image_class = soup.find('a', class_="button fancybox") full_image_click = image_class.get("id") browser.click_link_by_id(full_image_click) browser.is_element_present_by_id("fancybox-lock", wait_time=10) time.sleep(10) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') after_full_image_click = soup.body.prettify() # Clicks the more info button b = soup.body.find('div', class_="buttons") lin = b.find_all('a') more_in = lin[1].get('href') browser.links.find_by_partial_href(more_in).click() # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') # Scraping partial url for featured image and saving to variable. Then appending the partial url to a base url for a full url to the featured image. 
base_url = "jpl.nasa.gov" im_page = soup.select_one("figure.lede a img") im = im_page.get("src") im_url = base_url + im ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) time.sleep(5) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') mars_weather = soup.find( 'div', class_= "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0" ).text ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://space-facts.com/mars/' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') tables = soup.find_all('td') df = pd.read_html(url) mars_facts = df[0] mars_facts_table = mars_facts.to_html() #mars_facts_table = mars_facts_table.replace('\n', '') #pprint.pprint(mars_facts_table) ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') hemispheres = [] for i in range(4): browser.find_by_css('a.product-item h3')[i].click() # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') title = soup.find('h2', class_="title").get_text() link = soup.find("a", text="Sample").get("href") hemisphere = {"title": title, "link": link} hemispheres.append(hemisphere) browser.back() ### ### # ------------------------------------------------------------------------- # ### ### browser.quit() mars_dict = { "feat_im": im_url, "news_title": news_title, "news_desc": news_p, "weather": mars_weather, "facts": mars_facts_table, "hemishperes": hemispheres } print(mars_dict)