def scrape():
    """Scrape Mars news, the JPL featured image, weather, facts and the four
    hemisphere images; return everything in one dict (for a Mongo insert).

    Fixes vs. original: duplicate ``links``/``hemisphere_image_urls``
    definitions removed; the <td> collection is queried from the DOM once
    instead of once per cell; the astrogeology-prefix check had a stray
    trailing ':' that made it always true; ``news_p`` no longer unbound when
    the page has no <p> tags.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    marsdata = {}

    # --- NASA Mars news (plain requests; no browser needed here) ---
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.title.text
    # Keep the text of the LAST <p> on the page (original behavior).
    news_p = None
    for paragraph in soup.find_all('p'):
        print(paragraph.text)
        news_p = paragraph.text
    marsdata["news_title"] = news_title
    marsdata["news_p"] = news_p

    # --- JPL featured image: click through to the full-res page ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_by_id('full_image').click()
    browser.is_element_present_by_text('more info', wait_time=3)
    browser.find_link_by_partial_text('more info').click()
    img = BeautifulSoup(browser.html, 'html.parser')
    marsdata["featured_image_url"] = img.select_one('figure.lede a img').get('src')

    # --- Mars weather ---
    # NOTE(review): this searches the NEWS-page soup for "Sol", not a weather
    # source — looks unintentional; confirm which page should supply weather.
    marsdata["mars_weather"] = soup.find(string=re.compile("Sol"))

    # --- Space facts: <td> cells alternate label / value ---
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    cells = browser.find_by_tag('td')  # fix: one DOM query, not one per cell
    head = []
    data = []
    for r, cell in enumerate(cells):
        if r % 2 == 0:
            head.append(cell.text)
        else:
            data.append(cell.text)
    marsdata["Mars_facts"] = list(zip(head, data))

    # --- Mars hemispheres: click each link, grab title + full image URL ---
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    links = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']
    hemisphere_image_urls = []
    for link in links:
        browser.find_link_by_partial_text(link).click()
        time.sleep(15)  # page is slow to render the wide image
        browser.is_element_present_by_css("img.wide-image", wait_time=10)
        page = BeautifulSoup(browser.html, 'html.parser')
        image_url = page.find("img", class_="wide-image")["src"]
        title = page.find("h2", class_="title").text
        # fix: dropped the stray trailing ':' that made this check always true
        if "https://astrogeology.usgs.gov" not in image_url:
            image_url = "https://astrogeology.usgs.gov" + image_url
        hemisphere_image_urls.append({'title': title, 'image_url': image_url})
        browser.back()
    marsdata["hemisphere_title_urls"] = hemisphere_image_urls

    browser.quit()
    return marsdata
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

# NOTE: `!which chromedriver` is a Jupyter/IPython shell escape and is a
# syntax error in a plain .py file — run `which chromedriver` in a shell
# instead to locate the driver binary.

# Set the executable path and initialize the Chrome browser in Splinter.
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path)

# Visit the Mars NASA news site.
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Optional delay so the news list has time to render.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

# Convert the browser HTML to a soup object.
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

# Use the parent element to find the first headline and save it as `news_title`.
news_title = slide_elem.find("div", class_='content_title').get_text()
def init_browser():
    """Launch and return a visible Chrome browser driven by chromedriver.exe."""
    driver_config = {'executable_path': 'chromedriver.exe'}
    return Browser("chrome", **driver_config, headless=False)
def scrape(driver, driverpath):
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images using the given Splinter driver name and driver binary path.

    Returns a dict keyed news_title / news_content / featured_image_url /
    weather / facts_html / hemisphere_image_urls / time.
    """
    executable_path = {"executable_path": driverpath}
    # One browser session for all pages; the context manager closes it.
    with Browser(driver, **executable_path, headless=False) as browser:
        # --- latest news ---
        browser.visit("https://mars.nasa.gov/news/")
        time.sleep(T)
        page = BeautifulSoup(browser.html, "html.parser")
        news_title = page.find("div", {"class": "bottom_gradient"}).text
        news_content = page.find("div", {
            "class": "rollover_description_inner"
        }).text

        # --- featured image: click through to the medium-size image page ---
        browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        time.sleep(T)
        browser.click_link_by_id("full_image")
        time.sleep(T)
        browser.click_link_by_partial_text("more info")
        time.sleep(T)
        page = BeautifulSoup(browser.html, "html.parser")
        image_path = page.find('figure', class_='lede').a['href']
        featured_image_url = "https://www.jpl.nasa.gov/" + image_path

        # --- latest weather tweet ---
        browser.visit("https://twitter.com/marswxreport?lang=en")
        page = BeautifulSoup(browser.html, "html.parser")
        weather = page.find(
            "p", {
                "class":
                "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            }).text

        # --- facts table (label column, value column) ---
        browser.visit("https://space-facts.com/mars/")
        page = BeautifulSoup(browser.html, "html.parser")
        rows = page.find('table', {
            "class": "tablepress tablepress-id-mars"
        }).find_all("tr")
        facts = dict(label=[], value=[])
        for row in rows:
            cells = row.find_all("td")
            facts["label"].append(cells[0].text)
            facts["value"].append(cells[1].text)
        facts_html = pd.DataFrame(facts).to_html(header=False, index=False)

        # --- hemispheres: follow each description link for the full image ---
        browser.visit(
            "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        )
        time.sleep(T)
        page = BeautifulSoup(browser.html, "html.parser")
        descriptions = page.find(
            "div", {"class": "collapsible results"}).find_all(
                "div", {"class": "description"})
        hemisphere_image_urls = []
        for entry in descriptions:
            title = entry.a.h3.text
            detail_url = "https://astrogeology.usgs.gov" + entry.a['href']
            browser.visit(detail_url)
            time.sleep(T)
            detail = BeautifulSoup(browser.html, 'html.parser')
            image_url = detail.find('div', {
                "class": "downloads"
            }).find('li').a['href']
            hemisphere_image_urls.append(dict(title=title, url=image_url))
            # Visit the image itself as a sanity check on the link.
            browser.visit(image_url)
            time.sleep(T)

        return dict(news_title=news_title,
                    news_content=news_content,
                    featured_image_url=featured_image_url,
                    weather=weather,
                    facts_html=facts_html,
                    hemisphere_image_urls=hemisphere_image_urls,
                    time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
def scrape():
    """Scrape Mars news, the JPL featured image, weather and a facts table.

    Fixes vs. original: a second Browser was instantiated mid-function and the
    first one leaked; the featured image was extracted twice (the first,
    un-split value was a whole CSS style string and its print was bogus);
    ``requests.get(twitter_url)`` was called twice with the first response
    discarded; the browser is now quit before returning.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars news ---
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    soup = bs(browser.html, "html.parser")
    news_title = soup.find("div", class_="content_title").text
    news_paragraph = soup.find("div", class_="article_teaser_body").text
    print(news_title)
    print(news_paragraph)

    # --- JPL featured image ---
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    soup = bs(browser.html, "html.parser")
    # The image URL is embedded in the article's inline `style` attribute;
    # split on the quote characters to pull out the url(...) argument.
    featured_image = soup.find("div", class_="carousel_items").find("article")["style"]
    featured_image_split = featured_image.split("'")[1]
    featured_image_url = f'https://www.jpl.nasa.gov{featured_image_split}'
    print(featured_image_url)

    # --- Mars weather: latest tweet (plain requests; no browser needed) ---
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(twitter_url)
    soup = bs(response.text, "html.parser")
    mars_weather = soup.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    print(mars_weather)

    # --- Mars facts table as HTML ---
    facts_url = "https://space-facts.com/mars/"
    facts_df = pd.DataFrame(pd.read_html(facts_url)[0])
    facts_df_html = facts_df.to_html()

    browser.quit()  # fix: release the chromedriver process

    mars = {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "featured_image_url": featured_image_url,
        "facts_df_html": facts_df_html,
        "mars_weather": mars_weather,
    }
    return mars
] # load web pages without loading images in selenium from selenium import webdriver chromeOptions = webdriver.ChromeOptions() prefs = { 'profile.managed_default_content_settings.images': 2, 'disk-cache-size': 4096 } chromeOptions.add_experimental_option("prefs", prefs) driver = webdriver.Chrome(chrome_options=chromeOptions) executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False, options=chromeOptions) # visit main page first main_url = 'https://www.basketball-reference.com/' browser.visit(main_url) # wait before going through loop of each team's stats time.sleep(60) # loop to start the scraping of the stats team_list = [] x = 0 while x == 0: try: for team in teams: # URL
def scrape():
    """Collect Mars news, featured image, weather, facts and hemisphere image
    links into a single dict."""
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import time
    import pandas as pd
    import requests

    browser = Browser("chrome",
                      **{"executable_path": "chromedriver.exe"},
                      headless=False)
    mars_info = {}

    # --- NASA Mars news: first headline + teaser paragraph ---
    browser.visit("https://mars.nasa.gov/news/")
    news_doc = bs(browser.html, "html.parser")
    mars_info["news_title"] = news_doc.find("div", class_="content_title").text
    mars_info["news_p"] = news_doc.find("div", class_="article_teaser_body").text

    # --- JPL featured image: click through to the full-size link ---
    browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)  # let the lightbox render before the next click
    browser.click_link_by_partial_text("more info")
    img_doc = bs(browser.html, "html.parser")
    relative_img = img_doc.find("figure").find("a")["href"]
    mars_info["featured_image_url"] = f"https://www.jpl.nasa.gov{relative_img}"

    # --- Mars weather: latest tweet text (plain HTTP request) ---
    twt_resp = requests.get("https://twitter.com/marswxreport?lang=en")
    twt_doc = bs(twt_resp.text, "html.parser")
    mars_info["mars_weather"] = twt_doc.find(
        "div", class_="js-tweet-text-container").text.strip()

    # --- Mars facts table rendered to HTML ---
    facts_df = pd.read_html("https://space-facts.com/mars/")[0]
    facts_df.columns = ["description", "value"]
    mars_info["facts_html"] = facts_df.set_index("description").to_html().strip()

    # --- Hemispheres: follow each description link to its download URL ---
    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    astro_doc = bs(browser.html, "html.parser")
    hemisphere_image_urls = []
    for desc in astro_doc.find_all("div", class_="description"):
        title = desc.find("h3").text
        browser.visit(f"https://astrogeology.usgs.gov{desc.find('a')['href']}")
        hemi_doc = bs(browser.html, "html.parser")
        img_url = hemi_doc.find("div", class_="downloads").find("a")["href"]
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    mars_info["hemisphere_image_urls"] = hemisphere_image_urls

    browser.quit()
    return mars_info
def main():
    """Scrape a Toutiao author page: per-post stats plus totals/averages,
    written to tou_tiao2.csv and echoed to stdout."""
    browser = Browser(
        'chrome',
        headless=True,
        user_agent=
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
    )
    url = 'https://www.toutiao.com/c/user/5551493118/#mid=5551493118'
    get_full_page(browser, url)

    # Page elements: author name, post bodies, timestamps, and the
    # views/likes/comments anchors (three per post, in order).
    name = browser.find_by_css('span[class="name"]')[0].text
    contents = browser.find_by_css('div[class="ugc-content"]')
    time_stamps = browser.find_by_css('span[class="lbtn"]')
    results = browser.find_by_css('div[class="y-left"] a')

    num_of_toutiao = len(contents)
    total_num_of_likes = 0
    total_num_of_comments = 0
    total_num_of_views = 0.0  # accumulated in units of 万 (10,000)

    print("parsing the data")
    for result in results:
        text = result.text
        if "阅读" in text:
            # Views: either "x.y万" (two numbers) or a plain count.
            nums = extract_num(text)
            if "万" in text:
                total_num_of_views += float(nums[0]) + float(nums[1]) / 10
            else:
                total_num_of_views += float(nums[0]) / 10000
        elif "赞" in text:
            total_num_of_likes += int(extract_num(text)[0])
        elif "评论" in text:
            total_num_of_comments += int(extract_num(text)[0])

    total_num_of_views = round(total_num_of_views, 4)
    avg_num_of_likes = round(total_num_of_likes / num_of_toutiao, 2)
    avg_num_of_comments = round(total_num_of_comments / num_of_toutiao, 2)
    avg_num_of_views = round(total_num_of_views / num_of_toutiao, 2)

    print("writing the data to the csv file")
    with open('tou_tiao2.csv', mode='w', encoding="utf-8-sig") as fh:
        writer = csv.writer(fh,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        # Each post owns three consecutive entries in `results`.
        col = 0
        for i in range(num_of_toutiao):
            writer.writerow([
                name,
                contents[i].text,
                results[col].text,
                results[col + 1].text[3:],
                results[col + 2].text[3:],
                time_stamps[i].text[3:],
            ])
            col += 3
        writer.writerow([
            "总阅读数: " + str(total_num_of_views) + "万",
            "总赞数: " + str(total_num_of_likes),
            "总评论数: " + str(total_num_of_comments)
        ])
        writer.writerow([
            "平均阅读数: " + str(avg_num_of_views) + "万",
            "平均赞数: " + str(avg_num_of_likes),
            "平均评论数: " + str(avg_num_of_comments)
        ])

    print("平均阅读数:", avg_num_of_views, "万 ", "平均赞数:", avg_num_of_likes,
          "平均评论数:", avg_num_of_comments)
    print("总阅读数:", total_num_of_views, "万 ", "总赞数:", total_num_of_likes,
          "总评论数:", total_num_of_comments)
def scrape():
    """Scrape the latest Mars news headline, a Perseverance gallery image and
    the Mars facts table; hemisphere URLs are served from a static list.

    Fix vs. original: the Perseverance block used a bare ``except:`` which
    also swallowed SystemExit/KeyboardInterrupt — narrowed to ``Exception``.
    """
    mars_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    # The news page sometimes renders without any article markup; report a
    # friendly error instead of crashing in that case.
    browser.visit(mars_news_url)
    more_soup = BeautifulSoup(browser.html, 'html.parser')
    first = more_soup.find('li', class_='slide')
    if first is None:
        return_this = {
            'news_title':
            'Something went wrong talking to Nasa!',
            'news_summary':
            "For some reason when using scrape from the python file, it doesn't find any html for the page."
        }
    else:
        news_title = first.h3.text
        news_summary = first.find('div',
                                  class_='rollover_description_inner').text
        return_this = {"news_title": news_title, 'news_summary': news_summary}

    # Best-effort: grab the first gallery image; skip it if the layout differs.
    perseverance_image_url = 'https://www.nasa.gov/perseverance/images'
    try:
        browser.visit(perseverance_image_url)
        image_soup = BeautifulSoup(browser.html, 'html.parser')
        images = image_soup.find('div', class_='is-gallery')
        first_img = images.find('div', class_='image')
        first_img_href = first_img.find('img')['src']
        return_this.update(
            {'perseverance_image': 'https://www.nasa.gov' + first_img_href})
    except Exception:  # fix: was a bare except
        pass
    browser.quit()

    # Facts table via pandas, rendered as a Bootstrap-styled HTML table.
    facts_url = 'https://space-facts.com/mars/'
    df = pd.read_html(facts_url)[0].rename(columns={0: '', 1: 'Mars'})
    facts_table = df.to_html(index=False,
                             classes='table table-striped',
                             justify='left')
    return_this.update({"data_table": facts_table})

    # Hemisphere images are stable; serve cached URLs rather than re-scraping.
    hemisphere_image_urls = [{
        'title': 'Cerberus Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'
    }, {
        'title': 'Schiaparelli Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'
    }, {
        'title': 'Syrtis Major Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'
    }, {
        'title': 'Valles Marineris Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'
    }]
    return_this.update({'hemisphere_image_urls': hemisphere_image_urls})
    return return_this
def init_browser():
    """Create and return a visible Chrome Browser.

    Fix: the original built the browser but fell off the end of the function,
    so every caller received ``None`` — the sibling ``init_browser``
    definitions in this file all return the browser.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
def scrape():
    """Assemble the Mars dashboard payload: latest news, featured JPL image,
    facts table HTML and a static list of hemisphere image URLs."""
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- latest news headline + teaser ---
    browser.visit("https://mars.nasa.gov/news/")
    time.sleep(1)
    news_soup = bs(browser.html, 'html.parser')
    slide_element = news_soup.select_one("ul.item_list li.slide")
    news_title = slide_element.find("div", class_="content_title").get_text()
    print(f"The latest news title is: {news_title}")
    news_paragraph = slide_element.find(
        "div", class_="article_teaser_body").get_text()
    print(f"The lanews_paragraphtest news paragraph is: {news_paragraph}")

    # --- featured image: click 'full image', then 'more info' ---
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    time.sleep(1)
    browser.find_by_id("full_image").click()
    browser.is_element_present_by_text("more info", wait_time=1)
    browser.find_link_by_partial_text("more info").click()
    image_soup = bs(browser.html, "html.parser")
    relative_src = image_soup.select_one("figure.lede a img").get("src")
    img_url = f"https://www.jpl.nasa.gov{relative_src}"
    print(img_url)

    # --- facts table rendered as bare HTML rows ---
    mars_df = pd.read_html("https://space-facts.com/mars/")[0]
    mars_df.columns = ["Description", "Value"]
    mars_df.set_index("Description", inplace=True)
    mars_df_html = mars_df.to_html(header=False, index=False)

    # Hemisphere downloads are stable; use the known full-resolution URLs.
    hemisphere_image_urls = [{
        "title": "Cerberus Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg"
    }, {
        "title": "Valles Marineris Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg"
    }, {
        "title": "Schiaparelli Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg"
    }, {
        "title": "Syrtis Major Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg"
    }]

    return {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "image_URL": img_url,
        "mars_data": mars_df_html,
        "hemisphere_image": hemisphere_image_urls
    }
import selenium
import splinter
from splinter import Browser
import csv
import sys
from selenium import webdriver

# Two drivers are opened: a raw Selenium Firefox driver and a Splinter
# wrapper; later steps use one or the other.
driver = webdriver.Firefox(executable_path=r'F:\python\Scripts\geckodriver')
browser = Browser('firefox')
'''
###########STEP1 GET THE URLS OF RATING PAGES##########
f = open('c:\\Users\\Administrator\\url.txt', 'r')# url of the search results
with open("rating_url.csv",'w',encoding='utf-8') as fileout:
    writer = csv.writer(fileout)
    for url in f:
        driver.get(url)
        div = driver.find_elements_by_xpath('//*[@class="charity-name-desktop"]')
        for j in div:
            a = j.find_element_by_tag_name('a')
            href = a.get_attribute('href')
            writer.writerow(href)
######After this, I use excel to replace 'summary' with 'history' in the links to create history_page_url and save them into 'uu.txt'.(Save the 'click'.)
######If you don't have names of the list of charities,then can use this to extract names from search result page:
names = browser.find_by_xpath('//*[@class="charity-name-desktop"]')
for j in names:
    obj = [j.value]
'''


##########STEP2 SAVE THE HISTORICAL RATINGS########
def rat():
    # NOTE(review): this function appears truncated in this copy of the file —
    # only the name lookup survives; confirm the rest against the original.
    name = browser.find_by_xpath('//*[@class="charityname"]')
def init_browser():
    """Return a headless Chrome Browser, choosing the chromedriver path by OS."""
    if platform == "darwin":
        # macOS: Homebrew-style install location.
        driver = {"executable_path": "/usr/local/bin/chromedriver"}
    else:
        # Windows (and others): driver sits next to the script.
        driver = {'executable_path': 'chromedriver.exe'}
    return Browser("chrome", **driver, headless=True)
def init_browser():
    """Start a Chrome session via the chromedriver.exe next to this script."""
    chrome_config = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **chrome_config)
def scrape():
    """Scrape Mars news, featured image, an InSight weather tweet and the
    facts table; hemisphere image URLs are static. Returns one dict."""
    from splinter import Browser
    from bs4 import BeautifulSoup
    import pandas as pd
    import time

    driver_config = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **driver_config, headless=False)
    time.sleep(3)

    # --- news headline + teaser ---
    browser.visit('https://mars.nasa.gov/news/')
    time.sleep(1)
    doc = BeautifulSoup(browser.html, 'html.parser')
    news_title = doc.find('div', class_='content_title').text
    news_p = doc.find('div', class_='article_teaser_body').text

    # --- featured image from the fancybox overlay ---
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    time.sleep(1)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(1)
    featured_image_url = browser.find_by_css('.fancybox-image')['src']

    # --- weather: walk tweets until one mentions InSight ---
    browser.visit('https://twitter.com/marswxreport?lang=en')
    time.sleep(1)
    mars_weather = ''
    idx = 0
    while 'InSight' not in mars_weather:
        tweet = browser.find_by_css('.js-tweet-text-container')[idx]
        mars_weather = tweet.find_by_tag('p').text
        idx += 1

    # --- facts table -> {label-without-trailing-colon: value} dict ---
    facts = pd.read_html('https://space-facts.com/mars/')[0]
    facts.columns = ['a', 'b']
    facts_df = facts.set_index('a')
    facts_df.index.names = ['']
    mars_dict = {}
    for label, values in facts_df.iterrows():
        mars_dict[label[:-1]] = values[0]

    # Hemisphere downloads are stable; use the known full-resolution URLs.
    hemisphere_image_urls = [
        {
            "title": "Valles Marineris Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg"
        },
        {
            "title": "Cerberus Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg"
        },
        {
            "title": "Schiaparelli Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg"
        },
        {
            "title": "Syrtis Major Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg"
        },
    ]

    browser.quit()
    return {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image': featured_image_url,
        'mars_weather': mars_weather,
        'mars_facts': mars_dict,
        'hemisphere_imgs': hemisphere_image_urls
    }
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images; return a dict.

    Fixes vs. original: ``mars_weather`` was unbound (NameError at return) if
    no tweet mentioned "Sol "; the tweet scan could IndexError with fewer
    than 10 tweets; ``.replace('\\n', '')`` discarded its result (strings are
    immutable); the Splinter browser was never quit.
    """
    # --- NASA Mars news: title + teaser via plain requests ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image: click through with Splinter ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Docs recommend waiting between clicks; without it the next click fails.
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    new_soup = bs(browser.html, 'html.parser')
    back_half_img_url = new_soup.find('img', class_='main_image').get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url
    browser.quit()  # fix: release the chromedriver process

    # --- Mars weather: scan the 10 most recent tweets for "Sol " ---
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')
    tweet_containers = twitter_soup.find_all('div',
                                             class_="js-tweet-text-container")
    mars_weather = None  # fix: default when no weather tweet is found
    for container in tweet_containers[:10]:  # fix: tolerate <10 tweets
        tweets = container.text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts: pandas-scraped table, rendered to HTML + file ---
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    mars_data_df = mars_space_table_read[0]
    mars_data_df.set_index(0, inplace=True)
    mars_data_html = mars_data_df.to_html()
    # fix: replace() returns a new string; the original discarded it
    mars_data_html = mars_data_html.replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    # --- USGS hemispheres: follow each product link for the full-res image ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="item product-item")
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        img_title = hemi_img.find('h3').text
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_url = soup.find('div', class_='downloads').find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    return {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }
def scrape():
    """Scrape Mars news, featured image, the facts table (row by row) and the
    four hemisphere images into a flat dict for the database.

    Fix vs. original: ``title.rstrip('Enhanced')`` strips a *character set*,
    not the suffix — it left a trailing space and could over-strip titles
    ending in those letters; replaced with an explicit suffix removal.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)

    # --- latest news: date, title, teaser ---
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)
    soup = bs(browser.html, "html.parser")
    news_date = soup.find('li', class_='slide').find('div',
                                                     class_="list_date").text
    news_title = soup.find('div', class_="list_text").find(
        'div', class_="content_title").text
    news_text = soup.find('div', class_="list_text").find(
        'div', class_="article_teaser_body").text

    # --- featured image via click-through ---
    url2 = "https://www.jpl.nasa.gov/spaceimages/"
    browser.visit(url2)
    browser.find_by_id('full_image').click()
    time.sleep(3)
    browser.links.find_by_partial_text('more info').click()
    featured_image_url = browser.find_by_xpath(
        "//img[@class='main_image']")._element.get_attribute("src")

    # --- facts table, twice: pandas HTML render and raw row data ---
    url3 = "https://space-facts.com/mars/"
    table = pd.read_html(url3)
    table[0].columns = ['Parameter', 'Value']
    fact_table = table[0]
    table_html = fact_table.to_html()  # NOTE(review): rendered but not returned
    browser.visit(url3)
    soup = bs(browser.html, "html.parser")
    table1 = soup.findChildren('table')[0]
    table_data = []
    # NOTE(review): assumes every row has column-1/column-2 cells; a header
    # row without <td> cells would raise AttributeError here.
    for row in table1.findChildren(['th', 'tr']):
        title = row.find('td', class_="column-1").text.strip()
        value = row.find('td', class_="column-2").text.strip()
        table_data.append({'title': title, 'value': value})

    # --- hemispheres: follow each description link for the wide image ---
    url4 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url4)
    soup = bs(browser.html, "html.parser")
    images = soup.find_all('div', class_="description")
    link = f"https://astrogeology.usgs.gov"
    time.sleep(3)
    hem_img_urls = []
    for image in images:
        img_link = f"{link}{image.find('a')['href']}"
        browser.visit(img_link)
        img_url = browser.find_by_xpath(
            "//img[@class='wide-image']")._element.get_attribute("src")
        title = browser.find_by_xpath("//h2[@class='title']").text
        # fix: remove the literal " Enhanced" suffix instead of rstrip()
        if title.endswith(' Enhanced'):
            title = title[:-len(' Enhanced')]
        hem_img_urls.append({"title": title, "img_url": img_url})
    time.sleep(3)

    # Flatten everything into the dict shape the database expects.
    mars_web_dict = {
        'news_date': news_date,
        'news_title': news_title,
        'news_text': news_text,
        'featured_image_url': featured_image_url,
        'row1_title': table_data[0]['title'],
        'row1_value': table_data[0]['value'],
        'row2_title': table_data[1]['title'],
        'row2_value': table_data[1]['value'],
        'row3_title': table_data[2]['title'],
        'row3_value': table_data[2]['value'],
        'row4_title': table_data[3]['title'],
        'row4_value': table_data[3]['value'],
        'row5_title': table_data[4]['title'],
        'row5_value': table_data[4]['value'],
        'row6_title': table_data[5]['title'],
        'row6_value': table_data[5]['value'],
        'row7_title': table_data[6]['title'],
        'row7_value': table_data[6]['value'],
        'row8_title': table_data[7]['title'],
        'row8_value': table_data[7]['value'],
        'row9_title': table_data[8]['title'],
        'row9_value': table_data[8]['value'],
        'url1_title': hem_img_urls[0]['title'],
        'url1_img': hem_img_urls[0]['img_url'],
        'url2_title': hem_img_urls[1]['title'],
        'url2_img': hem_img_urls[1]['img_url'],
        'url3_title': hem_img_urls[2]['title'],
        'url3_img': hem_img_urls[2]['img_url'],
        'url4_title': hem_img_urls[3]['title'],
        'url4_img': hem_img_urls[3]['img_url']
    }
    browser.quit()
    return mars_web_dict
def firefox_installed():
    """Return True when splinter can launch Firefox on this machine."""
    try:
        Browser("firefox")
    except OSError:
        # The Firefox binary (or geckodriver) is missing.
        return False
    else:
        return True
def init_browser():
    """Launch a visible Chrome instance using the chromedriver in cwd."""
    chrome_driver = {"executable_path": "./chromedriver"}
    return Browser("chrome", headless=False, **chrome_driver)
def setUpClass(cls):
    """Create one shared Firefox browser for every test in this class."""
    cls.browser = Browser("firefox")
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Drives a visible Chrome browser across mars.nasa.gov, jpl.nasa.gov,
    twitter.com (@MarsWxReport), space-facts.com and astrogeology.usgs.gov.

    Returns:
        dict with keys: news_title, new_p, featured_image_url,
        mars_weather, mars_table, hemispheres_image_urls.
    """
    mars_data = {}
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    try:
        # --- NASA Mars News ---
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        news_title = soup.find('div', class_="bottom_gradient").text
        news_p = soup.find('div', class_='article_teaser_body').text
        mars_data["news_title"] = news_title
        # NOTE(review): key kept as "new_p" (looks like a typo for
        # "news_p") so existing consumers of this dict keep working.
        mars_data["new_p"] = news_p

        # --- Featured Image: click through FULL IMAGE -> more info ---
        image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(image_url)
        browser.click_link_by_partial_text('FULL IMAGE')
        time.sleep(5)  # give the lightbox time to load
        browser.click_link_by_partial_text('more info')
        time.sleep(5)
        soup = BeautifulSoup(browser.html, 'html.parser')
        image_path = soup.find('figure', class_='lede').a['href']
        mars_data["featured_image_url"] = 'https://www.jpl.nasa.gov/' + image_path

        # --- Mars Weather (best-effort: Twitter markup changes often,
        # matching the fallback style of the sibling scraper) ---
        mars_tweet = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(mars_tweet)
        try:
            soup = BeautifulSoup(browser.html, 'html.parser')
            mars_weather = soup.find(
                'div', class_='js-tweet-text-container').text.replace('\n', '')
        except Exception as e:
            print(e)
            mars_weather = "Latest Mars Weather Tweet not Available. Try again later."
        mars_data["mars_weather"] = mars_weather

        # --- Mars Facts table ---
        mars_fact = 'https://space-facts.com/mars/'
        browser.visit(mars_fact)
        soup = BeautifulSoup(browser.html, 'html.parser')
        labels = []
        values = []
        # Each data row holds a label td and a value td.
        for tr in soup.find_all('tr'):
            td_elements = tr.find_all('td')
            if len(td_elements) < 2:
                # Header rows have no (or fewer) td cells; previously
                # this raised IndexError.
                continue
            labels.append(td_elements[0].text)
            values.append(td_elements[1].text)
        mars_fact_table = pd.DataFrame({"Label": labels, "Values": values})
        mars_data["mars_table"] = mars_fact_table.to_html(header=False, index=False)

        # --- Mars Hemispheres: click each h3 result, collect title + url ---
        USGS_site = ('https://astrogeology.usgs.gov/search/results'
                     '?q=hemisphere+enhanced&k1=target&v1=Mars')
        browser.visit(USGS_site)
        soup = BeautifulSoup(browser.html, 'html.parser')
        images = soup.find('div', class_='collapsible results')
        hemispheres_image_urls = []
        for i in range(len(images.find_all("div", class_="item"))):
            time.sleep(5)  # let the results page settle before clicking
            browser.find_by_tag('h3')[i].click()
            soup = BeautifulSoup(browser.html, 'html.parser')
            title = soup.find("h2", class_="title").text
            link = soup.find("div", class_="downloads").find('a')
            hemispheres_image_urls.append({
                'title': title,
                'img_url': link.attrs['href'],
            })
            browser.back()
        mars_data["hemispheres_image_urls"] = hemispheres_image_urls
    finally:
        # BUGFIX: the browser was never closed, leaking a Chrome process
        # on every call (and on every exception).
        browser.quit()
    return mars_data
def test_should_support_with_statement(self):
    """Browser must be usable as a context manager (closes on exit)."""
    with Browser('firefox'):
        pass
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns:
        dict with keys: title, summary, image_url, mars_weather,
        mars_info (HTML table string), hemisphere_image_urls.
    """
    # Replace the path with actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    try:
        # --- mars.nasa.gov/news/: first headline and its teaser ---
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        nasa_soup = bs(browser.html, 'html.parser')
        summary = nasa_soup.find('div', class_="rollover_description_inner").text
        title = nasa_soup.find('div', class_="content_title").text
        print(f"Title: {title}")
        print(f"Summary: {summary}")

        # --- JPL Featured Space Image ---
        url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url_jpl)
        space_soup = bs(browser.html, 'html.parser')
        image = space_soup.find('a', class_='fancybox')['data-fancybox-href']
        image_url = 'https://www.jpl.nasa.gov' + image
        print(image_url)

        # --- Mars weather tweet (best-effort: markup changes often) ---
        tweet_url = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(tweet_url)
        try:
            weather_soup = bs(browser.html, 'html.parser')
            mars_weather = weather_soup.find('p', class_="TweetTextSize").text
            print(mars_weather)
        except Exception as e:
            print(e)
            mars_weather = "Latest Mars Weather Tweet not Available. Try again later."

        # --- Mars facts: first table on the page, rendered as HTML ---
        facts_url = 'https://space-facts.com/mars/'
        mars_info = pd.read_html(facts_url)[0].to_html(index=False, header=False)

        # --- Hemisphere photos: follow each result to its full image ---
        base_hemisphere_url = "https://astrogeology.usgs.gov"
        hemisphere_url = ("https://astrogeology.usgs.gov/search/results"
                          "?q=hemisphere+enhanced&k1=target&v1=Mars")
        browser.visit(hemisphere_url)
        soup = bs(browser.html, 'html.parser')
        hemisphere_image_urls = []
        for item in soup.find_all("div", class_="item"):
            # BUGFIX: this loop previously reused the name `title`,
            # clobbering the news title, so mars_data["title"] ended up
            # holding the last hemisphere's name instead of the headline.
            hemi_title = item.find("h3").text
            next_link = item.find("div", class_="description").a["href"]
            browser.visit(base_hemisphere_url + next_link)
            pic_soup = bs(browser.html, 'html.parser')
            pic_url = pic_soup.find("img", class_="wide-image")["src"]
            img_dict = {
                "title": hemi_title,
                "img_url": base_hemisphere_url + pic_url,
            }
            print(img_dict["img_url"])
            hemisphere_image_urls.append(img_dict)

        mars_data = {
            "title": title,
            "summary": summary,
            "image_url": image_url,
            "mars_weather": mars_weather,
            "mars_info": mars_info,
            "hemisphere_image_urls": hemisphere_image_urls,
        }
    finally:
        # BUGFIX: browser was never closed — Chrome leaked on every call.
        browser.quit()
    return mars_data
def setUpClass(cls):
    """Start one Firefox browser with the Firebug extension preinstalled.

    The firebug.xpi file is expected to sit next to this test module.
    """
    extension_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'firebug.xpi')
    cls.browser = Browser('firefox', extensions=[extension_path])
#print("added to db") print(mars_news) #Stick it all into Mongo #Collect it all up #collection.insert_one(mars_news) #Ok, let's use splinter. #Get current featured image url from here: #https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars #import splinter, etc from splinter import Browser executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) #go to the URL url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(url) #Navigate to the full image browser.click_link_by_partial_text('FULL IMAGE') #soup it up image_html = browser.html image_soup = BeautifulSoup(image_html, 'html.parser') #grab that image image_ext = image_soup.find('img', {'class': 'fancybox-image'})['src']
def setUpClass(cls):
    """Start one Firefox browser with custom profile preferences.

    Raises the script-runtime limit and enables the devtools inspector
    for every test in this class.
    """
    preferences = {
        'dom.max_script_run_time': 360,
        'devtools.inspector.enabled': True,
    }
    cls.browser = Browser("firefox", profile_preferences=preferences)
def init_browser():
    """Return a headless splinter Chrome browser (driver found on PATH)."""
    chrome = Browser("chrome", headless=True)
    return chrome
def setUpClass(cls):
    """Create one shared Firefox browser, opened in fullscreen mode."""
    cls.browser = Browser("firefox", fullscreen=True)
def init_browser():
    """Launch a visible Chrome browser via the local chromedriver.

    @NOTE: Replace the path with your actual path to the chromedriver.
    """
    driver_config = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", headless=False, **driver_config)
def init_browser():
    """Open a visible Chrome browser using a hard-coded driver path."""
    chromedriver = '/Users/abhsharm/Softwares/Drivers/chromedriver'
    return Browser('chrome', executable_path=chromedriver, headless=False)