Example #1
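Note: the listings below are fragments and omit their imports. A minimal import block inferred from usage (the alias bindings are assumptions; both alias styles appear across the examples):

# imports inferred from usage across the listings (assumed, not in the source)
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

req = requests        # some listings call req.get(...)
bs = BeautifulSoup    # some listings call bs(html, parser)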
def scrape():
    #scrape the NASA Mars News Site; collect the news title and paragraph text,
    #and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    #Scrape for news item
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # JPL's Space images

    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    #call soup
    html = browser.html
    soup = bs(html, "html.parser")

    #auto click through to full image
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')

    #soup gets image url
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    recent_mars_image_url = "https://www.imagecache.jpl.nasa.gov/images/640x350/PIA18605-16-640x350.jpg"

    #get data from Twitter for Mars weather
    twitter_req = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_bs = bs(twitter_req.text, 'html.parser')

    tweet_output = twitter_bs.find_all('div', class_="js-tweet-text-container")

    for i in range(10):
        tweets = tweet_output[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break
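    else:
        # fallback sketch (an assumption, not in the original): without it,
        # mars_weather is undefined below if no tweet mentions "Sol "
        mars_weather = None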

    #Mars facts
    request_mars_facts = req.get("https://space-facts.com/mars/")

    mars_table = pd.read_html(request_mars_facts.text)
    mars_df = mars_table[0]

    mars_df.set_index(0, inplace=True)
    mars_df2 = mars_df

    mars_data_html = mars_df2.to_html()
    mars_data_html = mars_data_html.replace('\n', '')
    mars_df2.to_html('mars_table.html')

    #Get pics of Mars' hemispheres
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)

    soup = bs(usgs_req.text, "html.parser")
    hemis_list = soup.find_all('a', class_="itemLink product-item")

    hemisphere_image_urls = []
    for hemi_img in hemis_list:
        img_title = hemi_img.find('h3').text
        link_to_img = "https://astrogeology.usgs.gov" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }

    return mars_data
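A minimal way to run scrape() and inspect its dictionary (an assumption, not part of the original listing; pprint is stdlib):

if __name__ == "__main__":
    from pprint import pprint
    pprint(scrape())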
Example #2

#(fragment: this listing assumes soup, browser, and mars_hemispheres_url are
#defined earlier in its source script, and it breaks off mid-dictionary)

#gather all of the links
parent_url = 'https://astrogeology.usgs.gov/'
hemisphere_image_title = []
hem_name = soup.find_all('h3')

for link in hem_name:
    hemisphere_image_title.append(link.text)

#%%
hemisphere_image_url = []

for hem in hemisphere_image_title:
    hem_dict = {'title': [], 'img_url': []}

    #find your image
    browser.click_link_by_partial_text(hem)

    url = browser.find_by_text('Sample')['href']

    hem_dict['img_url'] = url

    hem_dict['title'] = hem

    hemisphere_image_url.append(hem_dict)

    #go back to the index page before the next pass
    browser.visit(mars_hemispheres_url)

#%%
mars = {
    "featured_image": image_url,
    "mars_weather": mars_weather_tweet,
Example #3
def scrape():
    #scrape the NASA Mars News Site; collect the news title and paragraph text,
    #and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    #scrape the title and accompanying paragraph
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    #Visit the URL for JPL's Space images
    #splinter to navigate the site and find the image url for the current featured
    #image and assign it to featured_image_url (use .jpg)

    #set up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    #stir soup for scraping
    html = browser.html
    soup = bs(html, "html.parser")

    #have webdriver click links to get to the full image I want
    browser.click_link_by_partial_text('FULL IMAGE')

    #had to add this, wasn't working and docs recommended waiting between clicks
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    #stir new soup for scraping the image url
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    back_half_img_url = temp_img_url.get('src')

    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    #get mars weather. THE INSTRUCTIONS SAY SPECIFICALLY TO SCRAPE THE DATA
    #stir soup
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')

    #use find_all to get all the tweets on the page, scan the 10 most recent for "Sol"
    tweet_containers = twitter_soup.find_all('div',
                                             class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    #Mars facts: visit the webpage, use pandas to scrape the facts table,
    #and convert it to an html table string.
    request_mars_space_facts = req.get("https://space-facts.com/mars/")

    #use pandas to scrape html table data
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]

    #set the index to the titles of each statistic/value
    df.set_index(0, inplace=True)
    mars_data_df = df

    #convert new pandas df to html, replace "\n" to get html code
    mars_data_html = mars_data_df.to_html()
    mars_data_html = mars_data_html.replace('\n', '')
    mars_data_df.to_html('mars_table.html')
Example #4
def scrape():
    #scrape the NASA Mars News Site; collect the news title and paragraph text,
    #and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    #scrape the title and accompanying paragraph
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    #Visit the URL for JPL's Space images
    #splinter to navigate the site and find the image url for the current featured
    #image and assign it to featured_image_url (use .jpg)

    #set up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    #stir soup for scraping
    html = browser.html
    soup = bs(html, "html.parser")

    #have webdriver click links to get to the full image I want
    browser.click_link_by_partial_text('FULL IMAGE')

    #had to add this, wasn't working and docs recommended waiting between clicks
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    #stir new soup for scraping the image url
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    back_half_img_url = temp_img_url.get('src')

    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    #get mars weather. THE INSTRUCTIONS SAY SPECIFICALLY TO SCRAPE THE DATA
    #stir soup
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')

    #use find_all to get all the tweets on the page, scan the 10 most recent for "Sol"
    tweet_containers = twitter_soup.find_all('div',
                                             class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    #Mars facts: visit the webpage, use pandas to scrape the facts table,
    #and convert it to an html table string.
    request_mars_space_facts = req.get("https://space-facts.com/mars/")

    #use pandas to scrape html table data
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]

    #set the index to the titles of each statistic/value
    df.set_index(0, inplace=True)
    mars_data_df = df

    #convert new pandas df to html, replace "\n" to get html code
    mars_data_html = mars_data_df.to_html()
    mars_data_html = mars_data_html.replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    #Visit the USGS Astrogeology site to obtain high resolution images for
    #each of Mars's hemispheres
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)

    #..You will need to click each of the links to the hemispheres in order
    #....to find full res image

    #had trouble doing this with splinter, decided to just do a bunch of loops for img urls
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="itemLink product-item")
    #list to keep the dictionaries that have title and image url
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        #get the img title
        img_title = hemi_img.find('h3').text
        #print(img_title)
        #get the link to stir another soup, this is the page with the actual image url
        link_to_img = "https://astrogeology.usgs.gov" + hemi_img['href']
        #print(link_to_img)
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }

    return mars_data
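The scrape_all listing below calls an init_browser() helper that the source omits. A minimal sketch, assuming splinter with a chromedriver on the PATH (the headless flag mirrors the other examples):

def init_browser():
    # assumed helper, not in the source: start a splinter Chrome session
    executable_path = {'executable_path': 'chromedriver'}
    return Browser('chrome', **executable_path, headless=False)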
Example #5
def scrape_all():

    browser = init_browser()

    browser.visit('https://mars.nasa.gov/news/')

    html = browser.html
    news_soup = BeautifulSoup(html, 'lxml')

    title = news_soup.find_all('div', class_='content_title')
    #place results in designated variables to be used later
    news_title = title[1].text.strip()
    print(news_title)

    parag = news_soup.find_all('div', class_='article_teaser_body')
    news_p = parag[0].text.strip()
    print(news_p)

    # JPL Mars Space Images - Featured Image

    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")

    time.sleep(3)

    browser.click_link_by_partial_text('FULL IMAGE')

    browser.click_link_by_partial_text('more info')

    feat_html = browser.html
    feat_soup = BeautifulSoup(feat_html, 'html.parser')

    mars_img_url = feat_soup.find('figure', class_='lede').a['href']

    orig_url = "https://www.jpl.nasa.gov"
    featured_image_url = orig_url + mars_img_url
    print(f"{featured_image_url}")
    time.sleep(2)

    # Mars Facts

    mars_facts_url = 'https://space-facts.com/mars/'

    time.sleep(3)

    tables_found = pd.read_html(mars_facts_url)

    mars_facts_df = tables_found[0]
    mars_facts_df.head()

    #mars_html_table = mars_facts_df.to_html(classes='data table', index=False, header=False, border=0)
    mars_html_table = mars_facts_df.to_html()
    print(mars_html_table)

    # Mars Hemispheres

    #browser = Browser('chrome', **executable_path, headless=False)

    #hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )

    hemis_html = browser.html
    hemis_soup = BeautifulSoup(hemis_html, 'html.parser')

    hemis_orig_url = 'https://astrogeology.usgs.gov'

    hemisphere_urls = []

    hemis_items = hemis_soup.find_all('div', class_='item')

    # FOR loop to process titles and urls in a dictionary
    for item in hemis_items:

        title = item.find('h3').text
        partial_img_url = item.find('a',
                                    class_='itemLink product-item')['href']

        browser.visit(hemis_orig_url + partial_img_url)

        prev_html = browser.html
        hemis_soup = BeautifulSoup(prev_html, 'html.parser')

        img_url = hemis_orig_url + hemis_soup.find('img',
                                                   class_='wide-image')['src']

        hemisphere_urls.append({"title": title, "img_url": img_url})

        #print(f"{hemisphere_urls[item]}")

    # save all the compiled data about mars in a dictionary
    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls
    }
    #for debugging only
    # print("this is my mars dictionary")
    # print(f"[latest_news_title]")
    # print(f"[latest_news_parag]")
    # print(f"[JPL_featured_image]")
    # print(f"[mars_facts_table]")
    # print(f"[hemisphere_images]")

    # close browser
    browser.quit()

    return mars_dictionary
Example #6
def scrape():
    
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    Nasa_News_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(Nasa_News_url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    #scrape latest news title
    news_title = soup.find_all('div', class_='content_title')
    latest_title = news_title[1].text
    #print(latest_title)

    #scrape latest news article teaser
    news_teaser = soup.find_all('div', class_="article_teaser_body")
    latest_teaser = news_teaser[0].text
    #print(latest_teaser)

    #scrape JPL Mars featured image
    JPL_Mars_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(JPL_Mars_url)

    #click buttons to load image webpage
    browser.click_link_by_id("full_image")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")

    # make a new soup
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    sub_img = soup.find("figure", class_="lede")
    name = sub_img.a["href"]
    featured_image = "https://www.jpl.nasa.gov" + name
    #featured_image

    USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(USGS_url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    hemi_list = []

    hemispheres = soup.find_all("div", class_="item")

    for hemi in hemispheres:
        #for titles
        hemi_div = hemi.find("div", class_="description")
        hemi_title = hemi_div.a.h3.text
        #print(hemi_title)

        #click the link for this specific hemisphere (clicking the generic
        #"Hemisphere Enhanced" text would hit the same first link every pass)
        browser.click_link_by_partial_text(hemi_title)
        time.sleep(3)

        # scrape image
        html = browser.html
        soup_4 = BeautifulSoup(html, "html.parser")
        usgs_open = soup_4.find("img", class_="wide-image")
        usgs_src = usgs_open["src"]
        hemi_image_url = "https://astrogeology.usgs.gov" + usgs_src
        #print(hemi_image_url)
        hemi_list.append({"title": hemi_title, "img_url": hemi_image_url})

        #return to the results page so the next link is clickable
        browser.back()
        
    #assemble results (keys for the teaser and hemisphere list are
    #illustrative; the source listing omitted them)
    mars_scrape_data = {
        'Latest Headline': latest_title,
        'Latest Teaser': latest_teaser,
        'Featured Image': featured_image,
        'Hemisphere Images': hemi_list}

    return mars_scrape_data
Example #7
#Visit the URL for JPL's Space images
#splinter to navigate the site and find the image url for the current featured
#image and assign it to featured_image_url (use .jpg)
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
featured_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

# In[129]:

html = browser.html
soup = bs(html, "html.parser")

# In[130]:

browser.click_link_by_partial_text('FULL IMAGE')
#time.sleep(5)

# In[131]:

browser.click_link_by_partial_text('more info')

# In[134]:

new_html = browser.html
new_soup = bs(new_html, 'html.parser')
temp_img_url = new_soup.find('img', class_='main_image')
back_half_img_url = temp_img_url.get('src')

featured_image_url = "https://www.jpl.nasa.gov" + back_half_img_url
Example #8
def Scrape():

    print("COMMENCING SCRAPE")

    # Empty dictionary
    mars_dict = {}

    # ## NASA Mars News

    # Mars News URL
    url = "https://mars.nasa.gov/news/"

    # Retrieve page with the requests module
    html = requests.get(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(html.text, 'html.parser')

    # Get title & description
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', 'rollover_description_inner').text

    # Adding to dict
    mars_dict["news_title"] = news_title
    mars_dict["news_p"] = news_p

    print("NEWS TITLE & DESCRIPTION FOR MARS")

    # ## JPL Mars Space Images
    # Setting up splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_image)

    #Getting the base url
    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image))
    print(base_url)
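    # for this url_image, base_url evaluates to "https://www.jpl.nasa.gov/"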

    #Design an xpath selector to grab the image
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"
    #Use splinter to click on the mars featured image
    #to bring the full resolution image
    results = browser.find_by_xpath(xpath)
    img = results[0]
    img.click()

    ##get image url using BeautifulSoup
    html_image = browser.html
    soup = bs(html_image, "html.parser")
    img_url = soup.find("img", class_="fancybox-image")["src"]
    featured_image_url = base_url + img_url
    print(featured_image_url)

    mars_dict["featured_image_url"] = featured_image_url

    print("FEATURED IMAGE Mars")

    # ## Mars Weather

    # Dependencies
    import tweepy
    import json

    # Twitter API Keys
    consumer_key = "Ed4RNulN1lp7AbOooHa9STCoU"
    consumer_secret = "P7cUJlmJZq0VaCY0Jg7COliwQqzK0qYEyUF9Y0idx4ujb3ZlW5"
    access_token = "839621358724198402-dzdOsx2WWHrSuBwyNUiqSEnTivHozAZ"
    access_token_secret = "dCZ80uNRbFDjxdU2EckmNiSckdoATach6Q8zb7YYYE5ER"

    # Setup Tweepy API Authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    target_user = "******"
    full_tweet = api.user_timeline(target_user, count=1)
    mars_weather = full_tweet[0]['text']

    # Store weather
    #mars_weather = tweet['text']

    mars_dict["mars_weather"] = mars_weather

    print("WEATHER On Mars")

    # ## Mars Facts

    # Mars Facts URL
    url = "https://space-facts.com/mars/"

    # Retrieve page with the requests module
    html = requests.get(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html.text, 'html.parser')

    # Empty dictionary for info
    mars_profile = {}

    # Get info
    results = soup.find('tbody').find_all('tr')

    # Storing profile information
    for result in results:
        key = result.find('td', 'column-1').text.split(":")[0]
        value = result.find('td', 'column-2').text

        mars_profile[key] = value

    # Creating a DataFrame
    profile_df = pd.DataFrame([mars_profile]).T.rename(columns={0: "Value"})
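    # ([mars_profile] makes a one-row frame; .T turns each key into an index row)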
    profile_df.index.rename("Description", inplace=True)

    # Converting to html
    profile_html = "".join(profile_df.to_html().split("\n"))

    # Adding to dictionary
    mars_dict["profile_html"] = profile_html

    print("FACTS ACQUIRED")

    # ## Mars Hemispheres

    # Mars Hemispheres URL
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    # Empty list of image urls
    hemisphere_image_urls = []

    # ### The four hemisphere pages follow an identical pattern, so loop over
    # ### the names: Valles Marineris, Cerberus, Schiaparelli, Syrtis Major
    hemisphere_names = [
        "Valles Marineris", "Cerberus", "Schiaparelli", "Syrtis Major"
    ]

    for name in hemisphere_names:

        # Setting up splinter
        executable_path = {'executable_path': 'chromedriver.exe'}
        browser = Browser('chrome', **executable_path, headless=True)
        browser.visit(url)

        # Moving through pages
        time.sleep(5)
        browser.click_link_by_partial_text(name + ' Hemisphere Enhanced')
        time.sleep(5)

        # Create BeautifulSoup object; parse with 'html.parser'
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Store link
        link = soup.find('div', 'downloads').a['href']

        # Create and append dictionary
        hemisphere_image_urls.append({
            "title": name + " Hemisphere",
            "img_url": link
        })

        # Close this browser session before the next pass
        browser.quit()

    # Adding to dictionary
    mars_dict["hemisphere_image_urls"] = hemisphere_image_urls

    print("HEMISPHERE IMAGES ACQUIRED")
    print("----------------------------------")
    print("SCRAPING COMPLETED")

    return mars_dict