# Module-level imports so the standalone helper functions below can run on their own
from bs4 import BeautifulSoup as soup
import pandas as pd

import aux_func as aux


def mars_hs(browser):
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    webpage = aux.getParsedWebpage(browser, url)

    # get all unique links to the photo pages first
    page_links = soup.find_all(webpage, 'a', class_='itemLink product-item')
    page_links_list = list(set(page.get('href') for page in page_links))

    image_list = []

    # iterate through links and pull URL for full size images
    for link in page_links_list:
        url = f'https://astrogeology.usgs.gov{link}'
        webpage = aux.getParsedWebpage(browser, url)

        # get image title
        title = soup.find(webpage, 'h2', class_='title').get_text()

        # get full size image link
        downloads_section = soup.find(webpage, 'div', class_='downloads')
        image_link = soup.find(downloads_section, 'a').get('href')

        # add title and full-size image url to the list as a dict
        image_list.append({'title': title, 'image_url': image_link})

    return image_list
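# Example usage (a sketch, assuming chromedriver is installed at the path used
# in scrape() below and that aux_func provides getParsedWebpage):
#
#     from splinter import Browser
#     browser = Browser('chrome', **{"executable_path": "/usr/local/bin/chromedriver"}, headless=False)
#     hemispheres = mars_hs(browser)
#     # -> [{'title': '...', 'image_url': 'https://...'}, ...]
#     browser.quit()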
def jpl_image(browser):
    # Scrape the JPL space-images site and return the featured image URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    webpage = aux.getParsedWebpage(browser, url)

    # get and construct url for largest size of featured image available
    featured_url = soup.find(webpage, 'a', class_='button fancybox').get('data-fancybox-href')
    featured_filename = featured_url.split('/')[4].split('_')[0]
    featured_url = f'https://www.jpl.nasa.gov/spaceimages/images/largesize/{featured_filename}_hires.jpg'

    return featured_url
def mars_weather_tweet(browser):
    url = 'https://twitter.com/marswxreport?lang=en'
    webpage = aux.getParsedWebpage(browser, url)

    std_tweet_class = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'

    # pull text of most recent tweet about the weather
    first_tweet = soup.find_all(webpage, 'p', class_=std_tweet_class)[0].get_text()

    return first_tweet
def mars_news2(browser):
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    webpage = aux.getParsedWebpage(browser, url)

    # pull the teaser paragraphs for the most recent articles
    text_grouped = soup.find_all(webpage, 'div', class_='article_teaser_body')

    # return the teaser paragraph text of the first (most recent) article
    first_paragraph = aux.getParsedTextList(text_grouped)[0]

    return first_paragraph
def mars_facts(browser):
    url = 'https://space-facts.com/mars/'
    webpage = aux.getParsedWebpage(browser, url)

    # create dict to hold facts
    fact_dict = {}

    # get all rows in the facts table and parse into dict
    facts_all = soup.find(webpage, 'table', class_='tablepress tablepress-id-p-mars').find_all('tr')
    for fact in facts_all:
        fact_dict[soup.find(fact, 'strong').get_text()] = soup.find(fact, class_='column-2').get_text()

    # convert to DataFrame and then to an HTML table
    fact_df = pd.DataFrame.from_dict(fact_dict, orient='index')
    fact_df.rename(columns={0: 'Facts about Mars'}, inplace=True)
    fact_html = pd.DataFrame.to_html(fact_df)

    return fact_html
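# The functions above and the scrape() functions below rely on an `aux_func`
# module that is not shown here. The sketch below is an assumption inferred
# from how the helpers are called (names, argument order, and the contents of
# hidden_urls are guesses, not the actual implementation):
#
#     # aux_func.py (hypothetical sketch)
#     from bs4 import BeautifulSoup
#
#     # scrape targets referenced as aux.hidden_urls[0..4] in scrape();
#     # order inferred from the first scrape() draft below
#     hidden_urls = [
#         'https://mars.nasa.gov/news/',
#         'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars',
#         'https://twitter.com/marswxreport?lang=en',
#         'https://space-facts.com/mars/',
#         'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars',
#     ]
#
#     def getParsedWebpage(browser, url):
#         """Visit `url` with a splinter Browser and return the page parsed by BeautifulSoup."""
#         browser.visit(url)
#         return BeautifulSoup(browser.html, 'html.parser')
#
#     def getParsedTextList(elements):
#         """Return the stripped text of each element in a BeautifulSoup result set."""
#         return [element.get_text().strip() for element in elements]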
def scrape():
    # Import dependencies
    from bs4 import BeautifulSoup as soup
    from splinter import Browser
    import pandas as pd
    import aux_func as aux

    # initialize splinter browser
    browser = Browser('chrome', **{"executable_path": "/usr/local/bin/chromedriver"}, headless=False)

    # Scrape NASA Mars site for headlines, dates, and content preview
    url = 'https://mars.nasa.gov/news/'
    webpage = aux.getParsedWebpage(browser, url)

    # Scrapes most recent headlines, dates, and text in order
    headlines_grouped = soup.find_all(webpage, 'h3', class_=None)
    dates_grouped = soup.find_all(webpage, 'div', class_='list_date')
    text_grouped = soup.find_all(webpage, 'div', class_='article_teaser_body')

    # Iterates and generates list of all items
    zip_headlines = list(
        zip(aux.getParsedTextList(headlines_grouped),
            aux.getParsedTextList(dates_grouped),
            aux.getParsedTextList(text_grouped)))

    # Generate Dataframe from raw data
    headline_df = pd.DataFrame(zip_headlines)
    headline_df.rename(columns={
        0: 'headline',
        1: 'date',
        2: 'text'
    }, inplace=True)

    # Scrape JPL Mars website for most recent image
    # For some reason this pulls an image of Neptune
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    webpage = aux.getParsedWebpage(browser, url)

    # Get title and description
    featured_title = soup.find(webpage, 'h1', class_='media_feature_title').get_text()
    featured_description = soup.find(
        webpage, 'a', class_='button fancybox').get('data-description')
    featured_url = soup.find(
        webpage, 'a', class_='button fancybox').get('data-fancybox-href')
    featured_url = f'https://www.jpl.nasa.gov{featured_url}'
    print(featured_title)
    print(featured_description)
    print(featured_url)

    # Mars Twitter Parse
    # Hardcoded a single tweet for now for testing (not all tweets are weather reports, causes errors)
    url = 'https://twitter.com/MarsWxReport/status/1038219633726316544'
    webpage = aux.getParsedWebpage(browser, url)
    # std_tweet_class = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    std_tweet_class = 'TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text'

    # pull text of most recent tweet about the weather
    mars_weather = soup.find_all(webpage, 'p', class_=std_tweet_class)[0].get_text()

    # create split string to pull apart and add to dataframe
    mars_weather_split = mars_weather.split(',')

    # create dictionary to turn into a dataframe
    weather_dict = {
        'mars_date': f'{mars_weather.split("(")[0]}',
        'earth_date': f'{mars_weather.split("(")[1].split(")")[0]}',
        'temp_high': f'{mars_weather_split[1].split(" ")[2]}',
        'temp_low': f'{mars_weather_split[2].split(" ")[2]}',
        'pressure': f'{mars_weather_split[3].split(" ")[3]}',
        'daylight': f'{mars_weather_split[4].split(" ")[2]}'
    }
    weather_df = pd.DataFrame.from_dict(weather_dict, orient='index')
    weather_df = weather_df.rename(columns={0: 'Most Recent Weather on Mars'})
    weather_html = pd.DataFrame.to_html(weather_df)

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    webpage = aux.getParsedWebpage(browser, url)

    # dictionary to hold facts
    facts_dict = {}

    # get rows in facts table, parse into dictionary
    facts_all = soup.find(
        webpage, 'table', class_='tablepress tablepress-id-mars').find_all('tr')
    for fact in facts_all:
        facts_dict[soup.find(fact, 'strong').get_text()] = soup.find(
            fact, class_='column-2').get_text()

    # convert facts_dict to DF and HTML
    facts_df = pd.DataFrame.from_dict(facts_dict, orient='index')
    facts_df.rename(columns={0: 'Facts about Mars'}, inplace=True)
    facts_html = pd.DataFrame.to_html(facts_df)

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    webpage = aux.getParsedWebpage(browser, url)
    base_url = 'https://astrogeology.usgs.gov/'

    # get all unique links to photo pages
    page_links = soup.find_all(webpage, 'a', class_='itemLink product-item')
    page_links_list = list(set(page.get('href') for page in page_links))

    image_list = []

    # iterate through links and pull image URLs
    for link in page_links_list:
        url = f'https://astrogeology.usgs.gov{link}'
        webpage = aux.getParsedWebpage(browser, url)

        # get image title
        title = soup.find(webpage, 'h2', class_='title').get_text()

        # get full size image link
        downloads_section = soup.find(webpage, 'div', class_='downloads')
        image_link = soup.find(downloads_section, 'a').get('href')

        # add title and full-size image url to the list as a dict
        image_list.append({'title': title, 'image_url': image_link})

    # create DF
    image_df = pd.DataFrame(image_list, columns=['title', 'image_url'])

    # Return dictionary
    scrape_dict = {
        'headlines': zip_headlines,
        'featured_img_url': featured_url,
        'weather': weather_html,
        'facts': facts_html,
        'image_urls': image_list
    }

    return scrape_dict
def scrape():
    """
    Returns a dictionary with images and data on Mars.

    Contains the following information in the following formats:
    - Recent headlines (List)
    - Featured image URL (string)
    - Most recent weather (HTML Table)
    - Facts (HTML Table)
    - Image URLs for Hemisphere Pictures (List of Dict)
    """
    # ------------------------------------------------------------
    # Step 1: Import all required modules and initialize all tools
    # ------------------------------------------------------------
    from bs4 import BeautifulSoup as soup
    from splinter import Browser
    import pandas as pd
    import aux_func as aux

    # initialize splinter browser
    browser = Browser('chrome', **{"executable_path": "/usr/local/bin/chromedriver"}, headless=False)

    # ------------------------------------------------------------
    # Step 2: Scrape the NASA Mars News website for recent
    # headlines with dates and content previews
    # ------------------------------------------------------------
    url = aux.hidden_urls[0]
    try:
        webpage = aux.getParsedWebpage(browser, url)

        # pull the most recent headlines + info from the website
        headlines = soup.find_all(webpage, 'h3', class_=None)
        dates = soup.find_all(webpage, 'div', class_='list_date')
        text = soup.find_all(webpage, 'div', class_='article_teaser_body')

        # iterate through and generate lists of all individual items
        zipped_headlines = list(
            zip(aux.getParsedTextList(headlines),
                aux.getParsedTextList(dates),
                aux.getParsedTextList(text)))

        # generate a readable dataframe
        headline_df = pd.DataFrame(zipped_headlines)
        headline_df.rename(columns={
            0: 'headline',
            1: 'date',
            2: 'text'
        }, inplace=True)
        headline_html = pd.DataFrame.to_html(headline_df)
    except Exception as error:
        print(error)

    # ------------------------------------------------------------
    # Step 3: Scrape the JPL space-images website for the featured image
    # ------------------------------------------------------------
    try:
        url = aux.hidden_urls[1]
        webpage = aux.getParsedWebpage(browser, url)

        # get title and description
        featured_title = soup.find(webpage, 'h1',
                                   class_='media_feature_title').get_text()
        featured_description = soup.find(
            webpage, 'a', class_='button fancybox').get('data-description')

        # get and construct url for largest size of featured image available
        featured_url = soup.find(
            webpage, 'a', class_='button fancybox').get('data-fancybox-href')
        featured_filename = featured_url.split('/')[4].split('_')[0]
        featured_url = f'https://www.jpl.nasa.gov/spaceimages/images/largesize/{featured_filename}_hires.jpg'
    except Exception as error:
        print(error)

    # ------------------------------------------------------------
    # Step 4: Scrape Mars Twitter for the most recent weather update
    # ------------------------------------------------------------
    try:
        url = aux.hidden_urls[2]
        webpage = aux.getParsedWebpage(browser, url)

        std_tweet_class = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'

        # pull text of most recent tweet about the weather
        recent_weather = soup.find_all(webpage, 'p',
                                       class_=std_tweet_class)[0].get_text()

        # create split string to pull apart and add to dataframe
        recent_weather_split = recent_weather.split(',')
        recent_weather_split = [i.split(' ') for i in recent_weather_split][2:]

        # create dictionary to turn into a dataframe
        weather_dict = {
            'mars_date': f'{recent_weather.split("(")[0]}',
            'earth_date': f'{recent_weather.split("(")[1].split(")")[0]}',
            'cur_weather': f'{recent_weather_split[0][1]}',
            'temp_high': f'{recent_weather_split[1][2]}',
            'temp_low': f'{recent_weather_split[2][2]}',
            'pressure': f'{recent_weather_split[3][3]} {recent_weather_split[3][4]}',
            'daylight': f'{recent_weather_split[4][2]}'
        }
        weather_df = pd.DataFrame.from_dict(weather_dict, orient='index')
        weather_df = weather_df.rename(
            columns={0: 'Most Recent Weather on Mars'})
        weather_html = pd.DataFrame.to_html(weather_df)
    except Exception as error:
        print(error)

    # ------------------------------------------------------------
    # Step 5: Scrape the Space Facts website for data on Mars
    # ------------------------------------------------------------
    try:
        url = aux.hidden_urls[3]
        webpage = aux.getParsedWebpage(browser, url)

        # create dict to hold facts
        fact_dict = {}

        # get all rows in the facts table and parse into dict
        facts_all = soup.find(
            webpage, 'table', class_='tablepress tablepress-id-mars').find_all('tr')
        for fact in facts_all:
            fact_dict[soup.find(fact, 'strong').get_text()] = soup.find(
                fact, class_='column-2').get_text()

        # convert to Dataframe and to HTML table
        fact_df = pd.DataFrame.from_dict(fact_dict, orient='index')
        fact_df.rename(columns={0: 'Facts about Mars'}, inplace=True)
        fact_html = pd.DataFrame.to_html(fact_df)
    except Exception as error:
        print(error)

    # ------------------------------------------------------------
    # Step 6: Scrape images and titles from the Astrogeology site
    # ------------------------------------------------------------
    try:
        url = aux.hidden_urls[4]
        webpage = aux.getParsedWebpage(browser, url)

        # store the base link for the page
        base_url = 'https://astrogeology.usgs.gov/'

        # get all unique links to the photo pages first
        page_links = soup.find_all(webpage, 'a', class_='itemLink product-item')
        page_links_list = list(set(page.get('href') for page in page_links))

        image_list = []

        # iterate through links and pull URL for full size images
        for link in page_links_list:
            url = f'https://astrogeology.usgs.gov{link}'
            webpage = aux.getParsedWebpage(browser, url)

            # get image title
            title = soup.find(webpage, 'h2', class_='title').get_text()

            # get full size image link
            downloads_section = soup.find(webpage, 'div', class_='downloads')
            image_link = soup.find(downloads_section, 'a').get('href')

            # add title and full-size image url to the list as a dict
            image_list.append({'title': title, 'image_url': image_link})
    except Exception as error:
        print(error)

    # ------------------------------------------------------------
    # Step 7: Stick everything into a dictionary and return it
    # ------------------------------------------------------------
    scrape_results_dict = {
        'headlines': zipped_headlines,
        'featured_img_url': featured_url,
        'weather': weather_html,
        'facts': fact_html,
        'image_urls': image_list
    }

    return scrape_results_dict
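# A minimal usage sketch (an assumption about how this module is meant to be
# driven; the original does not show a caller): run the file directly to
# perform the full scrape and print the keys of the resulting dictionary.
if __name__ == '__main__':
    results = scrape()
    print(results.keys())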