/
scrape_mars.py
105 lines (97 loc) · 3.79 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
def scrape():
    """Scrape several Mars-related web pages and return the results in one dict.

    A Chrome browser is driven through splinter for the pages that need
    clicking (news, featured images, hemispheres); the weather page is fetched
    with requests and the facts table is read with pandas. The browser is
    always closed before returning, even when a step raises.

    Returns:
        dict with the keys
            'NewsTitle'      - numpy array of the unique headline texts,
            'NewsParagraps'  - numpy array of the unique teaser texts,
            'FeaturedImages' - numpy array of the unique image URLs,
            'Facts'          - pandas DataFrame with columns
                               ['MarsFacts', 'Value'],
            'Weather'        - text of the second tweet found on the page
                               (the only tweet if just one was found, None
                               if none could be parsed),
            'Hemispheres'    - list of {'title': ..., 'img_url': ...} dicts.

    Raises:
        Whatever the network/driver layers raise for the non-optional steps
        (e.g. requests exceptions for the weather page, pandas errors when
        the facts table cannot be read, KeyError if it has no 'Earth' column).
    """
    from urllib.parse import urljoin

    from splinter import Browser
    from splinter.exceptions import ElementDoesNotExist
    import numpy as np
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests

    executable_path = {'executable_path': 'Resources/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    def page_soup():
        """Parse the page the browser is currently showing."""
        return BeautifulSoup(browser.html, 'html.parser')

    def advance(partial_text):
        """Click the first link containing partial_text.

        Returns False once no such link exists (pagination is finished),
        True otherwise.
        """
        try:
            browser.click_link_by_partial_text(partial_text)
        except ElementDoesNotExist:
            # splinter signals "no matching link" with this exception.
            print("Scraping Complete")
            return False
        except Exception as exc:
            # Best-effort: any other click failure must not abort the whole
            # scrape; report it and let the next round try again.
            print('Could not click', repr(partial_text), '-', exc)
        return True

    try:
        # 1.1 Scraping News Title and Paragraphs
        url1 = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
        browser.visit(url1)
        news_title = []
        news_para = []
        for _ in range(10):
            soup = page_soup()
            for heading in soup.find_all(class_='content_title'):
                # Skip 'content_title' elements that carry no link.
                if heading.a is not None:
                    news_title.append(heading.a.text)
            for teaser in soup.find_all(class_='article_teaser_body'):
                news_para.append(teaser.text)
            if not advance('MORE'):
                break
        # The same items are seen again on every round, so de-duplicate.
        np_news_title = np.unique(np.array(news_title))
        np_news_para = np.unique(np.array(news_para))

        # 1.2 Get Images JPL Mars Space Images - Featured Image
        url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url2)
        featured_image_url = []
        for _ in range(5):
            soup = page_soup()
            for holder in soup.find_all(class_='img'):
                img_tag = holder.img
                if img_tag is None or not img_tag.get('src'):
                    continue
                featured_image_url.append(
                    urljoin('https://www.jpl.nasa.gov', img_tag['src']))
            if not advance('Next'):
                break
        np_featured_image_url = np.unique(np.array(featured_image_url))

        # 1.3 Mars Weather
        url3 = 'https://twitter.com/marswxreport?lang=en'
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(url3, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        tweets = []
        for container in soup.find_all('div', class_="js-tweet-text-container"):
            if container.p is not None:
                tweets.append(container.p.text)
        # The original picks the second tweet; fall back gracefully instead
        # of raising IndexError when fewer than two were parsed.
        if len(tweets) > 1:
            mars_weather = tweets[1]
        elif tweets:
            mars_weather = tweets[0]
        else:
            mars_weather = None

        # 1.4 Mars Facts
        url4 = 'https://space-facts.com/mars/'
        mars_facts = pd.read_html(url4)[0]
        mars_facts.drop(columns='Earth', inplace=True)
        mars_facts.columns = ['MarsFacts', 'Value']

        # 1.5 Mars Hemispheres
        url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url5)
        soup = page_soup()
        detail_links = [
            urljoin('https://astrogeology.usgs.gov', div.a['href'])
            for div in soup.find_all('div', class_='item')
            if div.a is not None and div.a.get('href')
        ]
        image_url = []
        for link in detail_links:
            # Each detail page is best-effort: a page that fails to load or
            # lacks the expected elements is reported and skipped.
            try:
                browser.visit(link)
                soup = page_soup()
                # NOTE(review): the page is parsed before 'Open' is clicked,
                # so the click only acts as a check that the link exists -
                # confirm whether parsing after the click was intended.
                browser.click_link_by_partial_text('Open')
                img = soup.find('img', class_='wide-image')
                heading = soup.find('h2', class_='title')
                image_url.append({
                    'title': heading.text.replace(' Enhanced', ''),
                    'img_url': urljoin('https://astrogeology.usgs.gov/',
                                       img['src']),
                })
            except Exception as exc:
                print('Skipping hemisphere page', link, '-', exc)
    finally:
        # Always release the browser/driver process.
        browser.quit()

    scrapped = {
        'NewsTitle': np_news_title,
        'NewsParagraps': np_news_para,
        'FeaturedImages': np_featured_image_url,
        'Facts': mars_facts,
        'Weather': mars_weather,
        'Hemispheres': image_url,
    }
    return scrapped
# Run the scraper and keep the returned dict in the module-level name
# `scrape_value`.
# NOTE(review): as a top-level statement this executes every time the file is
# run or imported - confirm that importers rely on `scrape_value` before
# moving it behind an `if __name__ == "__main__":` guard.
scrape_value = scrape()