Ejemplo n.º 1
0
def scrape_mars_weather():
    """Scrape the most recent Mars weather report tweet.

    Visits the @marswxreport Twitter page, walks the tweet containers in
    page order and keeps the first tweet whose text mentions both "Sol" and
    "pressure".

    Returns:
        The shared ``mars_information`` dict, with ``"weather_tweet"`` set to
        the matching tweet text, or ``None`` when no tweet matches.
    """
    # Start the browser outside the ``try`` so a failed start-up does not
    # reach ``finally`` with ``browser`` still unbound.
    browser = initialize_browser()
    try:
        weather_url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(weather_url)

        soup = bs(browser.html, "html.parser")
        recent_tweets = soup.find_all("div", class_="js-tweet-text-container")

        weather_tweet = None
        for tweet in recent_tweets:
            paragraph = tweet.find("p")
            if paragraph is None:
                # Container without a text paragraph: nothing to inspect.
                continue
            text = paragraph.text
            # Both keywords must be tested explicitly; the expression
            # `"Sol" and "pressure" in text` only checks for "pressure".
            if "Sol" in text and "pressure" in text:
                weather_tweet = text
                print(weather_tweet)
                break

        mars_information["weather_tweet"] = weather_tweet

        return mars_information

    finally:
        browser.quit()
Ejemplo n.º 2
0
 def test_webdriverchrome_should_be_deprecated(self):
     """Check that Browser('webdriver.chrome') emits the deprecation warning.

     Records warnings while creating and quitting the browser, then compares
     the first recorded warning's message with the expected text.
     """
     with warnings.catch_warnings(record=True) as warnings_list:
         # 'default' makes sure the warning is not suppressed by filters.
         warnings.simplefilter('default')
         from splinter.browser import Browser
         browser = Browser('webdriver.chrome')
         browser.quit()
         warning_message = warnings_list[0].message.args[0]
         # assertEquals is a deprecated alias (removed in Python 3.12).
         self.assertEqual("'webdriver.chrome' is deprecated, use just 'chrome'", warning_message)
Ejemplo n.º 3
0
 def test_webdriverchrome_should_be_deprecated(self):
     """Check that Browser('webdriver.chrome') emits the deprecation warning.

     Records warnings while creating and quitting the browser, then compares
     the first recorded warning's message with the expected text.
     """
     with warnings.catch_warnings(record=True) as warnings_list:
         # 'default' makes sure the warning is not suppressed by filters.
         warnings.simplefilter('default')
         from splinter.browser import Browser
         browser = Browser('webdriver.chrome')
         browser.quit()
         warning_message = warnings_list[0].message.args[0]
         # assertEquals is a deprecated alias (removed in Python 3.12).
         self.assertEqual("'webdriver.chrome' is deprecated, use just 'chrome'", warning_message)
Ejemplo n.º 4
0
def scrape_mars_facts():
    """Scrape the Mars facts table and store it as an HTML string.

    Reads the first table found at space-facts.com/mars with pandas, labels
    its two columns "Measurement" and "Value", indexes it by "Measurement"
    and renders it to HTML.

    Returns:
        The shared ``mars_information`` dict, with ``"table"`` set to the
        rendered HTML table.
    """
    # No browser is needed here: pandas fetches the page itself. The previous
    # code bound the ``initialize_browser`` function without calling it and
    # then failed on ``browser.quit()``.
    facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(facts_url)

    mars_facts_df = tables[0]
    mars_facts_df.columns = ["Measurement", "Value"]
    mars_facts_df.set_index("Measurement", inplace=True)

    mars_information["table"] = mars_facts_df.to_html()

    return mars_information
Ejemplo n.º 5
0
def scrape_all():
    """Run every scraping function and collect the results in one dict.

    Starts a headless Chrome browser, passes it to the individual scrapers
    and always quits it afterwards, even when a scraper raises.

    Returns:
        A dict with the keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``facts``, ``last_modified`` and ``hemispheres``.
    """
    # Initiate headless driver for deployment
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    try:
        news_title, news_paragraph = mars_news(browser)

        # Run all scraping functions and store results in dictionary
        return {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now(),
            "hemispheres": hemispheres(browser),
        }
    finally:
        # Stop the webdriver on success and on failure alike, so a failed
        # scrape does not leave a browser process behind.
        browser.quit()
Ejemplo n.º 6
0
def scrape_mars_news():
    """Scrape the first headline and teaser from the NASA Mars news page.

    Returns:
        The shared ``mars_information`` dict, with ``"news_headline"`` and
        ``"news_story"`` set to the text of the first matching elements.

    Raises:
        AttributeError: if the expected headline or teaser element is not
            present in the page.
    """
    # Start the browser outside the ``try`` so a failed start-up does not
    # reach ``finally`` with ``browser`` still unbound.
    browser = initialize_browser()
    try:
        url = "https://mars.nasa.gov/news/"
        browser.visit(url)

        soup = bs(browser.html, "html.parser")

        headline_div = soup.find("div", class_="content_title")
        teaser_div = soup.find("div", class_="article_teaser_body")

        mars_information["news_headline"] = headline_div.find("a").text
        mars_information["news_story"] = teaser_div.text

        return mars_information

    finally:
        browser.quit()
Ejemplo n.º 7
0
def scrape_all():
    """Run every scraping function and collect the results in one dict.

    Starts a Chrome browser (not headless), passes it to the individual
    scrapers and always quits it afterwards, even when a scraper raises.

    Returns:
        A dict with the keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``facts``, ``weather``, ``hemisphere_title``,
        ``hemispheres`` and ``last_modified``.
    """
    # Initialize the browser
    browser = Browser("chrome", executable_path="chromedriver", headless=False)

    try:
        # Set news title and paragraph variables
        news_title, news_paragraph = mars_news(browser)

        # Run all scraping functions and store results in a dictionary.
        # NOTE(review): `title` and `hemisphere_image_urls` are not defined
        # in this function; presumably they are module-level names - confirm,
        # otherwise this raises NameError.
        return {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "facts": mars_facts(),
            "weather": mars_weather(browser),
            "hemisphere_title": title,
            "hemispheres": hemisphere_image_urls,
            "last_modified": dt.datetime.now(),
        }
    finally:
        # Stop the webdriver on success and on failure alike, so a failed
        # scrape does not leave a browser process behind.
        browser.quit()
Ejemplo n.º 8
0
def scrape_mars_image():
    """Scrape the URL of the featured Mars image from the JPL site.

    The image path is taken from the inline ``style`` attribute of the first
    ``<article>`` element and prefixed with the JPL host.

    Returns:
        The shared ``mars_information`` dict, with ``"featured_image_url"``
        set to the absolute image URL.

    Raises:
        TypeError: if the page contains no ``<article>`` element.
        KeyError: if that element has no ``style`` attribute.
    """
    # Start the browser outside the ``try`` so a failed start-up does not
    # reach ``finally`` with ``browser`` still unbound.
    browser = initialize_browser()
    try:
        url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(url_image)

        soup = bs(browser.html, "html.parser")

        # Strip the CSS wrapper `background-image: url(...);`, then drop the
        # first and last remaining characters with [1:-1] (presumably the
        # quotes around the path - verify against the live markup).
        style = soup.find("article")["style"]
        image_path = style.replace(
            "background-image: url(", "").replace(");", "")[1:-1]

        main_url = "https://www.jpl.nasa.gov"
        mars_information["featured_image_url"] = main_url + image_path

        return mars_information

    finally:
        browser.quit()
Ejemplo n.º 9
0
def scrape():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    The collected data is inserted into the local MongoDB database
    ``mars_db`` and also returned.

    Returns:
        A dict with the keys ``article_title``, ``news_list``,
        ``featured_image``, ``mars_table`` and ``hemisphere_dict_list``.
        pymongo's ``insert_one`` adds an ``_id`` key to the dict it inserts,
        so the returned dict carries that key as well.

    Raises:
        IndexError: if an expected element or table is missing from a page.
    """
    from pymongo import MongoClient

    url = "https://mars.nasa.gov/news/"
    jpl_url = "https://www.jpl.nasa.gov/images?search=&category=Mars"
    mars_url = "https://space-facts.com/mars/"
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    dict_data = {}
    browser = init_browser()
    try:
        # --- News ---------------------------------------------------------
        browser.visit(url)
        time.sleep(1)
        soup = BeautifulSoup(browser.html, 'html.parser')
        titles_body_soup = soup.find_all("div",
                                         class_=("content_title",
                                                 "article_teaser_body"))

        # Matches alternate in document order: even positions are collected
        # as news text, odd positions as titles.
        title_list = []
        news_list = []
        for i, x in enumerate(titles_body_soup):
            if i % 2 == 0:
                news_list.append(x.text)
            else:
                title_list.append(x.text)
        # news list has extra line due to div content_title from nav bar element
        news_list = news_list[1:49]

        # --- Featured image ----------------------------------------------
        browser.visit(jpl_url)
        time.sleep(.5)
        browser.find_by_css("img.BaseImage").click()
        browser.find_by_css("svg.IconExpand").click()
        jpl_soup = BeautifulSoup(browser.html, "html.parser")
        featured_image_jpg = jpl_soup.find_all(
            "div", class_="BaseLightbox__slide__img")[0]("img")[0]["src"]

        # --- Facts table --------------------------------------------------
        # pandas downloads the page itself, so no browser visit is needed.
        mars_table = pd.read_html(mars_url)
        mars_facts = mars_table[0].rename(columns={
            0: "Description",
            1: "Mars"
        }).set_index("Description")
        mars_html = mars_facts.to_html()

        # --- Hemispheres --------------------------------------------------
        hemisphere_dict_list = []
        browser.visit(hemi_url)
        time.sleep(.5)
        for x in range(4):
            browser.find_by_css("img.thumb")[x].click()
            browser.find_by_css("a.open-toggle").click()
            hemi_soup = BeautifulSoup(browser.html, "html.parser")
            title = hemi_soup("h2", class_="title")[0].text
            hemisphere_dict_list.append({
                "title": title.replace(" Enhanced", ""),
                "img_url": hemi_soup("img", class_="wide-image")[0]["src"],
            })
            # Go back to the result list before clicking the next thumbnail.
            browser.visit(hemi_url)
    finally:
        # Quit the browser even when a page fails to parse.
        browser.quit()

    dict_data["article_title"] = title_list[0]
    dict_data["news_list"] = news_list[0]
    dict_data["featured_image"] = featured_image_jpg
    dict_data["mars_table"] = mars_html
    dict_data["hemisphere_dict_list"] = hemisphere_dict_list

    mongo_conn = MongoClient('mongodb://localhost:27017')
    try:
        mars_db = mongo_conn["mars_db"]
        # NOTE(review): attribute access targets the collection literally
        # named "mars_coll". The previous code also created an unused handle
        # for a collection named "mars"; the target is left unchanged here
        # because readers of the database are not visible - confirm which
        # collection is intended.
        mars_db.mars_coll.insert_one(dict_data)
    finally:
        mongo_conn.close()

    return dict_data
Ejemplo n.º 10
0
    #    hemispheres = {hemisphere_image_urls}
    #mars_data['hemisphere_image_urls'] = hemisphere_image_urls

    #return mars_data

    # 4. Print the list that holds the dictionary of each image url and title.
    hemisphere_image_urls


# Mongodb Helper Function
def scrape_hemisphere(html_text):
    """Extract a hemisphere's title and sample-image link from page HTML.

    Args:
        html_text: HTML source of a hemisphere detail page.

    Returns:
        A dict with the keys ``"title"`` (text of the ``<h2 class="title">``
        element) and ``"img_url"`` (``href`` of the link whose text is
        "Sample"). Both values are ``None`` when either element is missing.
    """
    parsed_page = soup(html_text, "html.parser")
    try:
        heading = parsed_page.find("h2", class_="title").get_text()
        sample_href = parsed_page.find("a", text="Sample").get("href")
    except AttributeError:
        # A failed ``find`` returns None; treat the whole page as unusable.
        return {"title": None, "img_url": None}
    return {"title": heading, "img_url": sample_href}


# 5. Quit the browser.
# NOTE(review): this statement is at module level, so it runs on import as
# well as on direct execution, and it needs a module-level `browser` name to
# exist already - confirm this is intended.
browser.quit()

# Script entry point: only runs when the file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    # If running as script, print scraped data
    print(scrape_all())
Ejemplo n.º 11
0
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Visits each source with one browser session, prints intermediate results,
    writes the facts table to ``table.html`` and always quits the browser,
    even when a page fails to parse.

    Returns:
        A dict with the keys ``"News Title"``, ``"News Paragraph"``,
        ``"Featured Image"``, ``"Mars Weather"``, ``"Mars Facts"``,
        ``"Mars Table"`` and ``"Mars Hemisphere"``.

    Raises:
        AttributeError, TypeError, KeyError: if an expected element is
            missing from one of the pages.
    """
    browser = init_browser()
    try:
        # --- Mars news ----------------------------------------------------
        url_news = "https://mars.nasa.gov/news/"
        browser.visit(url_news)
        time.sleep(1)

        soup = BeautifulSoup(browser.html, 'html.parser')

        # NOTE(review): these are the matched elements, not their text; only
        # the printed output uses `.text`. Confirm consumers of the returned
        # dict expect element objects.
        news_title = soup.find('div', class_='content_title')
        news_p = soup.find('div', class_='article_teaser_body')

        print(news_title.text)
        print(news_p.text)

        # --- Mars featured image -----------------------------------------
        url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(url_image)
        time.sleep(1)

        soup = BeautifulSoup(browser.html, 'html.parser')

        # The full-size image link is the first anchor inside the footer.
        link = soup.find("footer").find('a')
        featured_image_url = link['data-fancybox-href']

        # NOTE(review): the host is only added to the printed value; the
        # returned "Featured Image" is the attribute value as found.
        print('https://www.jpl.nasa.gov/' + featured_image_url)

        # --- Mars weather -------------------------------------------------
        url_mars_weather = "https://twitter.com/marswxreport?lang=en"
        browser.visit(url_mars_weather)
        time.sleep(1)

        soup = BeautifulSoup(browser.html, 'html.parser')

        # First <span> whose text matches the (case-sensitive) pattern 'sol'.
        sol_pattern = re.compile(r'sol')
        mars_weather = soup.find('span', text=sol_pattern).text
        print(mars_weather)

        # --- Mars facts ---------------------------------------------------
        url_facts = "https://space-facts.com/mars/"
        browser.visit(url_facts)
        time.sleep(1)

        soup = BeautifulSoup(browser.html, 'html.parser')

        # Plain-text content of the facts table.
        mars_facts = soup.find('table', id="tablepress-p-mars-no-2").text
        print(mars_facts)

        # --- Mars table data ---------------------------------------------
        table = pd.read_html(url_facts)
        df = table[0]
        df.to_html('table.html')

        # --- Mars hemispheres --------------------------------------------
        url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        browser.visit(url_hemisphere)
        time.sleep(1)

        soup = BeautifulSoup(browser.html, 'html.parser')
        items = soup.find_all('div', class_='item')

        main_url = 'https://astrogeology.usgs.gov'
        url_hemisphere_img = []

        for item in items:
            title = item.find('h3').text
            partial_img_url = item.find(
                'a', class_='itemLink product-item')['href']

            # Open the detail page to read the full-size image source.
            browser.visit(main_url + partial_img_url)
            detail_soup = BeautifulSoup(browser.html, 'html.parser')
            img_url = main_url + detail_soup.find(
                'img', class_='wide-image')['src']

            url_hemisphere_img.append({"title": title, "img_url": img_url})
    finally:
        # Close the browser after scraping, also when scraping failed.
        browser.quit()

    # Store data in a dictionary.
    # NOTE(review): "Mars Table" holds the list returned by pd.read_html,
    # not an HTML string - confirm this is what consumers expect.
    mars_info = {
        "News Title": news_title,
        "News Paragraph": news_p,
        "Featured Image": featured_image_url,
        "Mars Weather": mars_weather,
        "Mars Facts": mars_facts,
        "Mars Table": table,
        "Mars Hemisphere": url_hemisphere_img
    }

    return mars_info
Ejemplo n.º 12
0
def scrape_all():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Uses one browser session for all pages, prints intermediate results and
    always quits the browser, even when a page fails to parse.

    Returns:
        A dict with the keys ``latest_news_title``, ``latest_news_parag``,
        ``JPL_featured_image``, ``mars_facts_table`` and
        ``hemisphere_images``.

    Raises:
        IndexError, AttributeError, TypeError, KeyError: if an expected
            element is missing from one of the pages.
    """
    browser = init_browser()
    try:
        # --- NASA Mars news -----------------------------------------------
        browser.visit('https://mars.nasa.gov/news/')

        news_soup = BeautifulSoup(browser.html, 'lxml')

        # The second "content_title" match is used as the headline.
        titles = news_soup.find_all('div', class_='content_title')
        news_title = titles[1].text.strip()
        print(news_title)

        # NOTE(review): this keeps every teaser element found, not the text
        # of a single one - confirm consumers expect the full result set.
        news_p = news_soup.find_all('div', class_='article_teaser_body')
        print(news_p)

        # --- JPL Mars Space Images - Featured Image -----------------------
        browser.visit(
            "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        time.sleep(3)

        browser.click_link_by_partial_text('FULL IMAGE')
        browser.click_link_by_partial_text('more info')

        feat_soup = BeautifulSoup(browser.html, 'html.parser')
        mars_img_url = feat_soup.find('figure', class_='lede').a['href']

        orig_url = "https://www.jpl.nasa.gov"
        featured_image_url = orig_url + mars_img_url
        print(f"{featured_image_url}")
        time.sleep(2)

        # --- Mars Facts ---------------------------------------------------
        mars_facts_url = 'https://space-facts.com/mars/'
        time.sleep(3)

        tables_found = pd.read_html(mars_facts_url)
        mars_facts_df = tables_found[0]

        mars_html_table = mars_facts_df.to_html()
        print(mars_html_table)

        # --- Mars Hemispheres ---------------------------------------------
        browser.visit(
            "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        )

        hemis_soup = BeautifulSoup(browser.html, 'html.parser')
        hemis_orig_url = 'https://astrogeology.usgs.gov'
        hemis_items = hemis_soup.find_all('div', class_='item')

        hemisphere_urls = []
        for item in hemis_items:
            hemi_title = item.find('h3').text
            partial_img_url = item.find(
                'a', class_='itemLink product-item')['href']

            # Open the detail page to read the full-size image source.
            browser.visit(hemis_orig_url + partial_img_url)
            detail_soup = BeautifulSoup(browser.html, 'html.parser')
            img_url = hemis_orig_url + detail_soup.find(
                'img', class_='wide-image')['src']

            hemisphere_urls.append({"title": hemi_title, "img_url": img_url})
    finally:
        # Close the browser on success and on failure alike.
        browser.quit()

    # Save all the compiled data about Mars in a dictionary.
    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls
    }

    return mars_dictionary