def scrape_mars_weather():
    """Scrape the most recent Mars weather-report tweet.

    Visits the Mars Weather twitter feed, finds the first tweet whose text
    mentions both "Sol" and "pressure" (a weather report), and records it
    under the "weather_tweet" key of the module-level ``mars_information``
    dict (None when no matching tweet is found).

    Returns:
        dict: the shared ``mars_information`` dict with "weather_tweet" set.
    """
    browser = None
    try:
        browser = initialize_browser()
        weather_url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(weather_url)
        soup = bs(browser.html, "html.parser")
        weather_tweet = None
        for tweet in soup.find_all("div", class_="js-tweet-text-container"):
            text = tweet.find("p").text
            # BUG FIX: the original tested `"Sol" and "pressure" in text`,
            # which only checks "pressure" ("Sol" is always truthy).
            # Test both substrings explicitly.
            if "Sol" in text and "pressure" in text:
                weather_tweet = text
                print(weather_tweet)
                break
        mars_information["weather_tweet"] = weather_tweet
        return mars_information
    finally:
        # Guard: initialize_browser() may raise before `browser` is bound,
        # which would have made the original finally clause raise NameError.
        if browser is not None:
            browser.quit()
def test_webdriverchrome_should_be_deprecated(self):
    """Instantiating Browser('webdriver.chrome') must emit a warning
    telling callers to use plain 'chrome' instead."""
    with warnings.catch_warnings(record=True) as warnings_list:
        warnings.simplefilter('default')
        from splinter.browser import Browser
        browser = Browser('webdriver.chrome')
        browser.quit()
        warning_message = warnings_list[0].message.args[0]
        # FIX: assertEquals is a deprecated unittest alias that raises a
        # DeprecationWarning on modern Pythons; use assertEqual.
        self.assertEqual(
            "'webdriver.chrome' is deprecated, use just 'chrome'",
            warning_message)
def scrape_mars_facts():
    """Scrape the Mars facts table and store its HTML rendering.

    Reads the first HTML table from space-facts.com/mars (pandas fetches
    the page itself), labels its columns, indexes it by measurement name,
    and stores the rendered HTML under the "table" key of the module-level
    ``mars_information`` dict.

    Returns:
        dict: the shared ``mars_information`` dict with "table" set.
    """
    browser = None
    try:
        # BUG FIX: the original assigned the function object itself
        # (`browser = initialize_browser`, no parentheses), so the
        # browser.quit() in the finally clause raised AttributeError.
        browser = initialize_browser()
        facts_url = "https://space-facts.com/mars/"
        tables = pd.read_html(facts_url)
        mars_facts_df = tables[0]
        mars_facts_df.columns = ["Measurement", "Value"]
        mars_facts_df.set_index("Measurement", inplace=True)
        mars_information["table"] = mars_facts_df.to_html()
        return mars_information
    finally:
        if browser is not None:
            browser.quit()
def scrape_all():
    """Run every scraping helper with one headless Chrome session.

    Launches a headless Chrome driver (binary resolved automatically by
    webdriver-manager), delegates to the individual scraping functions,
    and bundles their results into a single dict.

    Returns:
        dict: news title/paragraph, featured image, facts table,
        last-modified timestamp, and hemisphere data.
    """
    # Headless Chrome for deployment.
    driver_config = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **driver_config, headless=True)

    title, paragraph = mars_news(browser)

    # Gather every scraped artifact (same order as the helpers run).
    results = {
        "news_title": title,
        "news_paragraph": paragraph,
        "featured_image": featured_image(browser),
        "facts": mars_facts(),
        "last_modified": dt.datetime.now(),
        "hemispheres": hemispheres(browser),
    }

    # Shut the webdriver down before handing results back.
    browser.quit()
    return results
def scrape_mars_news():
    """Scrape the latest NASA Mars news headline and teaser paragraph.

    Stores the results under the "news_headline" and "news_story" keys of
    the module-level ``mars_information`` dict.

    Returns:
        dict: the shared ``mars_information`` dict.
    """
    try:
        browser = initialize_browser()
        browser.visit("https://mars.nasa.gov/news/")
        page = bs(browser.html, "html.parser")

        headline_div = page.find("div", class_="content_title")
        teaser_div = page.find("div", class_="article_teaser_body")

        mars_information["news_headline"] = headline_div.find("a").text
        mars_information["news_story"] = teaser_div.text
        return mars_information
    finally:
        browser.quit()
def scrape_all():
    """Run every scraping helper and collect the results into one dict.

    NOTE(review): `title` and `hemisphere_image_urls` are not defined in
    this function — presumably module-level globals populated by earlier
    scraping code; confirm they exist before this runs, otherwise this
    raises NameError when building `data`.
    """
    # Initiate headless driver for deployment (initialize the browser)
    # NOTE(review): the comment says "headless" but headless=False is
    # passed — confirm which is intended for deployment.
    browser = Browser("chrome", executable_path="chromedriver", headless=False)
    # set news title and paragraph variables
    news_title, news_paragraph = mars_news(browser)
    # Run all scraping functions and store results in a dictionary (create data dictionary)
    data = {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "featured_image": featured_image(browser),
        "facts": mars_facts(),
        "weather": mars_weather(browser),
        "hemisphere_title": title,
        "hemispheres": hemisphere_image_urls,
        "last_modified": dt.datetime.now()
    }
    # Stop webdriver and return scraped data
    browser.quit()
    return data
def scrape_mars_image():
    """Scrape JPL's featured Mars image URL.

    Extracts the background-image URL from the featured <article> element's
    inline style, prefixes the JPL host, and stores the absolute URL under
    the "featured_image_url" key of the module-level ``mars_information``
    dict.

    Returns:
        dict: the shared ``mars_information`` dict.
    """
    browser = None
    try:
        browser = initialize_browser()
        url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(url_image)
        soup = bs(browser.html, "html.parser")
        # The style attribute looks like:
        #   background-image: url('/spaceimages/images/....jpg');
        # Strip the CSS wrapper, then the surrounding quotes ([1:-1]).
        relative_url = soup.find("article")["style"].replace(
            "background-image: url(", "").replace(");", "")[1:-1]
        # FIX: removed a no-op bare-expression line from the original
        # (`featured_image_url` on its own did nothing outside a notebook).
        mars_information["featured_image_url"] = (
            "https://www.jpl.nasa.gov" + relative_url)
        return mars_information
    finally:
        # Guard: initialize_browser() may raise before `browser` is bound.
        if browser is not None:
            browser.quit()
def scrape():
    """Scrape Mars news, JPL featured image, fact tables, and hemisphere
    images, persist the results to MongoDB, and return them.

    Returns:
        dict: article title, first news teaser, featured image URL,
        facts-table HTML, and a list of hemisphere {"title", "img_url"}
        dicts.
    """
    dict_data = {}
    browser = init_browser()

    url = "https://mars.nasa.gov/news/"
    jpl_url = "https://www.jpl.nasa.gov/images?search=&category=Mars"
    mars_url = "https://space-facts.com/mars/"
    hemi_url = ("https://astrogeology.usgs.gov/search/results"
                "?q=hemisphere+enhanced&k1=target&v1=Mars")

    # --- News: titles and teaser paragraphs ------------------------------
    browser.visit(url)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    titles_body_soup = soup.find_all(
        "div", class_=("content_title", "article_teaser_body"))
    title_list = []
    news_list = []
    # Matching divs alternate teaser/title in document order.
    for i, element in enumerate(titles_body_soup):
        if i % 2 == 0:
            news_list.append(element.text)
        else:
            title_list.append(element.text)
    # news_list has an extra leading entry: the nav bar also carries a
    # div.content_title element.
    news_list = news_list[1:49]

    # --- JPL featured image ----------------------------------------------
    browser.visit(jpl_url)
    time.sleep(.5)
    browser.find_by_css("img.BaseImage").click()
    browser.find_by_css("svg.IconExpand").click()
    jpl_soup = BeautifulSoup(browser.html, "html.parser")
    featured_image_jpg = jpl_soup.find_all(
        "div", class_="BaseLightbox__slide__img")[0]("img")[0]["src"]

    # --- Facts tables (pandas fetches the page itself) -------------------
    browser.visit(mars_url)
    mars_table = pd.read_html(mars_url)
    # Kept for parity with the original; currently unused downstream.
    planet_comparison_df = mars_table[1].set_index("Mars - Earth Comparison")
    mars_facts = mars_table[0].rename(columns=({
        0: "Description",
        1: "Mars"
    })).set_index("Description")
    mars_html = mars_facts.to_html()

    # --- Hemisphere images -----------------------------------------------
    hemisphere_dict_list = []
    for x in range(4):
        # Re-visit the results page each pass: the previous iteration
        # navigated away to a hemisphere detail page.
        browser.visit(hemi_url)
        time.sleep(.5)
        browser.find_by_css("img.thumb")[x].click()
        browser.find_by_css("a.open-toggle").click()
        hemi_soup = BeautifulSoup(browser.html, "html.parser")
        title = hemi_soup("h2", class_="title")[0].text
        # BUG FIX: the original mutated and appended ONE shared dict across
        # all four iterations, so every list entry aliased the same (last)
        # hemisphere's data. Build a fresh dict per iteration.
        hemisphere_dict_list.append({
            "title": title.replace(" Enhanced", ""),
            "img_url": hemi_soup("img", class_="wide-image")[0]["src"],
        })

    browser.quit()

    dict_data["article_title"] = title_list[0]
    dict_data["news_list"] = news_list[0]
    dict_data["featured_image"] = featured_image_jpg
    dict_data["mars_table"] = mars_html
    dict_data["hemisphere_dict_list"] = hemisphere_dict_list

    # --- Persist to MongoDB ----------------------------------------------
    from pymongo import MongoClient
    mongo_conn = MongoClient('mongodb://localhost:27017')
    mars_db = mongo_conn["mars_db"]
    mars_coll = mars_db["mars"]
    # BUG FIX: the original called `mars_db.mars_coll.insert_one(...)`,
    # which inserts into a collection literally named "mars_coll" instead
    # of the "mars" collection bound above.
    mars_coll.insert_one(dict_data)
    return dict_data
# hemispheres = {hemisphere_image_urls} #mars_data['hemisphere_image_urls'] = hemisphere_image_urls #return mars_data # 4. Print the list that holds the dictionary of each image url and title. hemisphere_image_urls # Mongodb Helper Function def scrape_hemisphere(html_text): hemisphere_soup = soup(html_text, "html.parser") try: title_element = hemisphere_soup.find("h2", class_="title").get_text() sample_element = hemisphere_soup.find("a", text="Sample").get("href") except AttributeError: title_element = None sample_element = None hemisphere = {"title": title_element, "img_url": sample_element} return hemisphere # 5. Quit the browser browser.quit() # Tell Flask the script is complete and ready. if __name__ == "__main__": # If running as script, print scraped data print(scrape_all())
def scrape():
    """Scrape Mars news, featured image, weather, facts, and hemisphere
    images in a single browser session.

    Returns:
        dict: keys "News Title", "News Paragraph", "Featured Image",
        "Mars Weather", "Mars Facts", "Mars Table" (list of DataFrames),
        and "Mars Hemisphere" (list of {"title", "img_url"} dicts).
    """
    browser = init_browser()

    # --- Mars News -------------------------------------------------------
    url_news = "https://mars.nasa.gov/news/"
    browser.visit(url_news)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # BUG FIX: the original stored the raw bs4 Tag objects in the result
    # dict; extract .text so callers receive plain strings.
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    # print text to confirm
    print(news_title)
    print(news_p)

    # --- Featured image --------------------------------------------------
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_image)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # FIX: dropped an unused `image = soup.find('img', ...)` local.
    footer = soup.find("footer")
    link = footer.find('a')
    featured_image_url = link['data-fancybox-href']
    # print a complete url string for this image
    print('https://www.jpl.nasa.gov/' + featured_image_url)

    # --- Mars weather ----------------------------------------------------
    url_mars_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_mars_weather)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # Match the first tweet span mentioning a sol number.
    sol_pattern = re.compile(r'sol')
    mars_weather = soup.find('span', text=sol_pattern).text
    print(mars_weather)

    # --- Mars facts ------------------------------------------------------
    url_facts = "https://space-facts.com/mars/"
    browser.visit(url_facts)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # FIX: dropped an unused `response = requests.get(url_facts)` call —
    # the page was already loaded above and the response was never read.
    mars_facts = soup.find('table', id="tablepress-p-mars-no-2").text
    print(mars_facts)
    # pandas fetches and parses the same table independently.
    table = pd.read_html(url_facts)
    df = table[0]
    df.to_html('table.html')  # side effect kept: writes table.html to disk

    # --- Mars hemispheres ------------------------------------------------
    url_hemisphere = ("https://astrogeology.usgs.gov/search/results"
                      "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(url_hemisphere)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    items = soup.find_all('div', class_='item')
    url_hemisphere_img = []
    main_url = 'https://astrogeology.usgs.gov'
    for item in items:
        title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        # Visit the detail page to find the full-resolution image.
        browser.visit(main_url + partial_img_url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        img_url = main_url + soup.find('img', class_='wide-image')['src']
        url_hemisphere_img.append({"title": title, "img_url": img_url})

    # Close the browser after scraping
    browser.quit()

    mars_info = {
        "News Title": news_title,
        "News Paragraph": news_p,
        "Featured Image": featured_image_url,
        "Mars Weather": mars_weather,
        "Mars Facts": mars_facts,
        "Mars Table": table,
        "Mars Hemisphere": url_hemisphere_img
    }
    return mars_info
def scrape_all():
    """Scrape NASA Mars news, JPL featured image, the facts table, and
    hemisphere images.

    Returns:
        dict: latest_news_title, latest_news_parag, JPL_featured_image,
        mars_facts_table (HTML string), hemisphere_images (list of
        {"title", "img_url"} dicts).
    """
    browser = init_browser()

    # --- Latest news -----------------------------------------------------
    browser.visit('https://mars.nasa.gov/news/')
    news_soup = BeautifulSoup(browser.html, 'lxml')
    title = news_soup.find_all('div', class_='content_title')
    # Index 1: index 0 is the nav bar's content_title div.
    news_title = title[1].text.strip()
    print(news_title)
    parag = news_soup.find_all('div', class_='article_teaser_body')
    # BUG FIX: the original stored the raw list of bs4 Tags
    # (`news_p = parag`); extract the first teaser's text so the result
    # dict holds a plain string.
    news_p = parag[0].text.strip() if parag else None
    print(news_p)

    # --- JPL featured image ----------------------------------------------
    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    time.sleep(3)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    feat_soup = BeautifulSoup(browser.html, 'html.parser')
    mars_img_url = feat_soup.find('figure', class_='lede').a['href']
    featured_image_url = "https://www.jpl.nasa.gov" + mars_img_url
    print(f"{featured_image_url}")
    time.sleep(2)

    # --- Mars facts ------------------------------------------------------
    # FIX: dropped a pointless time.sleep(3) and a no-op .head() call —
    # pandas fetches the URL itself; no browser navigation is involved.
    mars_facts_url = 'https://space-facts.com/mars/'
    tables_found = pd.read_html(mars_facts_url)
    mars_facts_df = tables_found[0]
    mars_html_table = mars_facts_df.to_html()
    print(mars_html_table)

    # --- Mars hemispheres ------------------------------------------------
    hemis_orig_url = 'https://astrogeology.usgs.gov'
    browser.visit(
        hemis_orig_url
        + "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")
    hemis_soup = BeautifulSoup(browser.html, 'html.parser')
    hemisphere_urls = []
    for item in hemis_soup.find_all('div', class_='item'):
        item_title = item.find('h3').text
        partial_img_url = item.find(
            'a', class_='itemLink product-item')['href']
        # Visit the detail page to find the full-resolution image.
        browser.visit(hemis_orig_url + partial_img_url)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        img_url = hemis_orig_url + detail_soup.find(
            'img', class_='wide-image')['src']
        hemisphere_urls.append({"title": item_title, "img_url": img_url})

    # Bundle everything scraped into one dictionary.
    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls
    }

    browser.quit()
    return mars_dictionary