def get_answers_html(exam_title):
    logger.info(f"begin: get_answers_html, q = {exam_title}")  # assumes a module-level `logger`
    browser = Browser(driver_name='chrome', executable_path='chromedriver.exe', headless=True)
    # example exam title: 'POS机、银行卡概述'
    url = 'https://www.tiku88.com/'
    browser.visit(url)
    browser.fill('q', exam_title)
    time.sleep(2)
    browser.find_by_id('search_submit').click()
    time.sleep(2)
    html = browser.html
    while True:
        try:
            # '下一页' is the site's "next page" link text
            browser.find_link_by_partial_text('下一页').first.click()
            time.sleep(3)
            print("appending next page's content")
            html += browser.html
        except Exception:
            print("no next page")
            break
    browser.quit()
    return html
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}  # was undefined in the original
    browser = Browser('chrome', **executable_path, headless=False)
    data = {}
    browser.visit('https://redplanetscience.com/')
    data['title'] = browser.find_by_css('div.content_title').text
    data['paragraph'] = browser.find_by_css('div.article_teaser_body').text
    browser.visit('https://spaceimages-mars.com')
    browser.find_link_by_partial_text('FULL IMAGE').click()
    data['image'] = browser.find_by_css('img.fancybox-image')['src']
    data['table'] = pd.read_html('https://galaxyfacts-mars.com')[1].to_html()
    browser.visit('https://marshemispheres.com/')
    hemispheres = []
    for i in range(4):
        hemisphere = {}
        # re-find the links on each pass to avoid stale element references
        hemisphere['title'] = browser.find_by_css('a.itemLink h3')[i].text
        browser.find_by_css('a.itemLink h3')[i].click()
        hemisphere['url'] = browser.find_by_text('Sample')['href']
        browser.back()
        hemispheres.append(hemisphere)
    browser.quit()
    data['hemispheres'] = hemispheres
    return data
def scrape_5():
    page = requests.get(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('div', class_='item')
    hemisphere_list = []
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    for result in results:
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        try:
            h3 = result.h3.text
            browser.find_link_by_partial_text(h3).click()
            new_html = browser.html
            img_soup = BeautifulSoup(new_html, 'html.parser')
            img_link = img_soup.find("img", class_='wide-image')['src']
            full_url = 'https://astrogeology.usgs.gov/' + img_link
            entry = {"text": h3, "url": full_url}
            hemisphere_list.append(dict(entry))
        except Exception:
            # skip items whose detail page is missing the expected image
            pass
    browser.quit()
    return hemisphere_list
def image_link():
    path = {}
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_link_by_partial_text('FULL').first.click()
    html = browser.html
    soup = bs(html, 'html.parser')
    image = soup.find('a', class_='button')
    link = image['data-link']
    image_url = 'https://www.jpl.nasa.gov' + str(link)
    browser.visit(image_url)
    html2 = browser.html
    soup2 = bs(html2, 'html.parser')
    jpeg_image = soup2.find('figure', class_='lede')
    final_jpeg = jpeg_image.a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + str(final_jpeg)
    path["src"] = featured_image_url
    return path
def featured_image(browser):
    # Use the browser passed in by the caller (the original re-created a second
    # browser here, shadowing the parameter).
    # Visit the NASA JPL (Jet Propulsion Laboratory) site
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    # Find the element with id "full_image" and click it
    full_image_button = browser.find_by_id("full_image")
    full_image_button.click()
    # Find the "More Info" button and click it
    browser.is_element_present_by_text("more info", wait_time=1)
    more_info_element = browser.find_link_by_partial_text("more info")
    more_info_element.click()
    # Parse the resulting HTML with BeautifulSoup
    html = browser.html
    image_soup = BeautifulSoup(html, "html.parser")
    img = image_soup.select_one("figure.lede a img")
    try:
        img_url = img.get("src")
    except AttributeError:
        return None
    # Use the base URL to create an absolute URL
    img_url = f"https://www.jpl.nasa.gov{img_url}"
    return img_url
class TestViews(unittest.TestCase):
    def setUp(self):
        """Test setup"""
        self.browser = Browser("phantomjs")
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        # Run the app in a separate process so the test can drive it
        self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """Test teardown"""
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_2_add_post(self):
        self.browser.visit("http://127.0.0.1:8080")
        print("current url = ", self.browser.url)
        self.browser.driver.set_window_size(1920, 1080)
        self.browser.click_link_by_text('login')
        print("current url = ", self.browser.url)
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)
        add_link = self.browser.find_link_by_partial_text('add')
        add_link.click()
        print(self.browser.url)
        title = "test_acceptance_add_post"
        self.browser.fill("title", title)
        now = str(datetime.datetime.now())
        self.browser.fill("content", now)
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)
        new_post_appears = (self.browser.is_text_present(title)
                            and self.browser.is_text_present(now))
        print("new_post_appears = ", new_post_appears)
        self.assertEqual(new_post_appears, True)
def Hemi_image(name):
    # Visit URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    browser.visit(url)
    # Find and click the title link
    browser.find_link_by_partial_text(name).click()
    # Find the open button and click it
    browser.find_by_id('wide-image-toggle').click()
    # Parse the resulting html with soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Find the relative image url; bail out if the image isn't on the page
    # (the original try/except AttributeError would not catch the TypeError
    # raised when subscripting a missing tag)
    img = soup.find('img', {"class": "wide-image"})
    if img is None:
        return None
    img_url_1 = img['src']
    # Use the base URL to create an absolute URL
    return f'https://astrogeology.usgs.gov{img_url_1}'
def scrape_2():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()
    new_html = browser.html
    img_soup = BeautifulSoup(new_html, 'html.parser')
    img_link = img_soup.find("figure", class_='lede').a['href']
    recent_image = 'https://www.jpl.nasa.gov' + str(img_link)
    return recent_image
def scrape():
    results = {}
    executable_path = {
        'executable_path': r'C:\p\HomeWork\Web-Scraping-Challenge\Mission_to_Mars\chromedriver.exe'
    }
    browser = Browser('chrome', **executable_path)
    # 1. latest news from mars.nasa.gov
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    browser.is_element_present_by_css('ul.item_list', wait_time=2)
    soup = BeautifulSoup(browser.html, 'html.parser')
    title = soup.find('div', 'content_title').get_text()
    news_p = soup.find('div', 'article_teaser_body').get_text()
    results['news_title'] = title
    results['news_paragraph'] = news_p
    # 2. featured image from jpl.nasa.gov/spaceimages
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_btn = browser.find_by_id('full_image')
    full_image_btn.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_btn = browser.find_link_by_partial_text('more info')
    more_info_btn.click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    img_url_rel = soup.select_one('figure.lede a img').get('src')
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
    results['featured_images'] = img_url
    # 3. facts table from space-facts.com/mars/
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    results['facts'] = df.to_html(classes='table table-striped')
    # 4. hemisphere images from astrogeology.usgs.gov
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = []
    links = browser.find_by_css('a.product-item h3')
    for i in range(len(links)):
        hemi = {}
        # re-find the links on each pass to avoid a stale element reference
        browser.find_by_css('a.product-item h3')[i].click()
        sample_elm = browser.find_link_by_text('Sample').first
        hemi['img_url'] = sample_elm['href']
        hemi['title'] = browser.find_by_css('h2.title').text
        hemispheres.append(hemi)
        browser.back()
    results['hemispheres'] = hemispheres
    return results
def retrieve_hemispheres():
    # assumes a module-level `source_urls` dict with a 'hemispheres' entry
    browser = Browser("chrome", headless=True)
    browser.visit(source_urls['hemispheres'])
    browser.click_link_by_partial_text('Enhanced')
    browser.click_link_by_partial_text('Back')
    hemisphere_links = browser.find_link_by_partial_text('Hemisphere')
    link_text = [link.text for link in hemisphere_links]
    hemisphere_image_urls = []
    for link in link_text:
        browser.click_link_by_partial_text(link)
        hemisphere_image_urls.append({
            'title': link[:-9],  # strip the trailing ' Enhanced'
            'tif_url': browser.find_link_by_partial_text('Original')['href'],
            'jpg_url': browser.find_link_by_text('Sample')['href'],
        })
        browser.click_link_by_partial_text('Back')
    return hemisphere_image_urls
def scrape():
    browser = init_browser()  # assumes a helper that returns a splinter Browser
    mars = {}
    # Mars featured image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    image = browser.find_by_id('full_image')
    image.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    html = browser.html
    soup = bs(html, 'html.parser')
    image_find = soup.find('figure', class_='lede').find('img')['src']
    featured_image_url = f"https://www.jpl.nasa.gov/{image_find}"
    mars["featured_image_url"] = featured_image_url
    # Mars Weather
    url2 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url2)
    html = browser.html
    soup = bs(html, 'html.parser')
    mydivs = soup.find_all("p", {
        "class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    })
    mars_weather = mydivs[0].text
    mars['mars_weather'] = mars_weather
    # Mars Facts
    url3 = 'https://space-facts.com/mars/'
    tables = pd.read_html(url3)
    df = tables[0]
    df.columns = ['Details', 'Values']
    df.set_index('Details', inplace=True)
    # html_table = df.to_html().replace('\n', '')
    mars['mars_facts'] = df
    return mars
class Submitter:
    def __init__(self, url, username, password, course_id, homework_id, submit_list):
        self._callback = None
        self._browser = Browser()
        self._url = url
        self._username = username
        self._password = password
        self._course_id = course_id
        self._homework_id = homework_id
        self._submit_list = submit_list

    def _login(self):
        self._browser.visit(self._url)
        self._browser.fill("i_user", self._username)
        self._browser.fill("i_pass", self._password)
        self._browser.find_by_id("loginButtonId").click()

    def _nvi2course(self):
        self._browser.find_link_by_partial_text(self._course_id).first.click()
        self._browser.windows.current.close()

    def _nvi2homework(self):
        # '课程作业' is the site's "course homework" link text
        self._browser.find_link_by_partial_text("课程作业").first.click()
        self._browser.find_link_by_partial_text(self._homework_id).first.click()

    def _submit(self, stu_id, grade, comment, ex_file):
        xpath_str = '//tbody/tr[td[3]=' + stu_id + ']/td[last()]/a'
        self._browser.find_by_xpath(xpath_str).last.click()
        self._browser.fill('cj', grade)
        self._browser.fill('pynr', comment)
        if os.path.splitext(ex_file)[1] == '.pdf':
            self._browser.driver.find_element_by_name('fileupload').send_keys(ex_file)
        submit_btn_css = 'div[class="sub-back sub-back-3 absolute"] > input[class="btn"]'
        self._browser.find_by_css(submit_btn_css).first.click()
        # wait for the confirmation dialog ('关闭' means "close"), then dismiss it
        while not self._browser.is_text_present('关闭', wait_time=1):
            pass
        self._browser.find_by_text('关闭').click()
        self._browser.back()
        self._browser.back()

    def add_single_task_callback(self, callback):
        self._callback = callback

    def start(self):
        self._login()
        self._nvi2course()
        self._nvi2homework()
        for stu_id, grade, comment, ex_file in self._submit_list:
            self._submit(stu_id, grade, comment, ex_file)
            self._callback([stu_id, grade, comment, ex_file])
        self._browser.quit()

    @staticmethod
    def clean():
        work_dir = os.getcwd()
        os.remove(work_dir + "/geckodriver.log")
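A minimal driver sketch for the Submitter class above; every value here (URL, credentials, IDs, grade, file path) is a made-up placeholder rather than anything from the original source:

# Hypothetical usage of Submitter; all values below are placeholders.
submit_list = [
    ('2021001', '95', 'Well done', '/tmp/feedback.pdf'),  # (stu_id, grade, comment, file)
]
submitter = Submitter(
    url='https://example.edu/login',  # placeholder login page
    username='teacher01',
    password='secret',
    course_id='CS101',
    homework_id='HW3',
    submit_list=submit_list,
)
# the callback receives [stu_id, grade, comment, ex_file] after each submission
submitter.add_single_task_callback(lambda task: print('submitted:', task))
submitter.start()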
def mars_hemi():
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    # Visit the mars hemisphere site
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    # Set up HTML parser
    html = browser.html
    hem_soup = BeautifulSoup(html, 'html.parser')
    # Find all h3 headings and put the titles in a list
    hem_title = hem_soup.find_all('h3')
    hem_list = [title.text for title in hem_title]
    url_list = []
    for index in range(4):
        # Revisit the listing page for each hemisphere
        browser.visit(url)
        browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
        # Find the title link and click it
        more_info_elem = browser.find_link_by_partial_text(hem_list[index])
        more_info_elem.click()
        # Parse the resulting html with soup
        html = browser.html
        img_soup = BeautifulSoup(html, 'html.parser')
        # Find the relative image url
        hem_url_rel = img_soup.find('img', class_="wide-image").get("src")
        # Use the base URL to create an absolute URL
        hem_url = f'https://astrogeology.usgs.gov/{hem_url_rel}'
        url_list.append(hem_url)
    return hem_list, url_list
def JPL_image():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    browser.find_link_by_partial_text('FULL IMAGE').click()
    browser.is_element_not_present_by_id('images', wait_time=2)
    browser.find_link_by_partial_text('more info').click()
    link = browser.find_link_by_partial_href('largesize')
    # pull the image path out of the anchor's html, e.g. href="...largesize/...jpg"
    image_url = link.html.split("=")[-1].lstrip('"').rstrip('">')
    featured_image_url = 'https://www.jpl.nasa.gov' + image_url
    return featured_image_url
def get_featured_img_func(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # Go to website
    browser.visit(url)
    # Find the "Full Image" button and click it to get to the next page
    full_img = browser.find_by_id("full_image")
    full_img.click()
    # Find the "More Info" button and click it to get to the next page
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    # Read the page's html
    html = browser.html
    soup = bs(html, 'html.parser')
    # Grab the href containing the image URL (index 58 is brittle: it
    # depends on the page's anchor order staying fixed)
    result = browser.find_by_tag("a")
    relative_image_path = result[58]["href"]
    # Get the image title
    relative_image_title = soup.find('h1', class_='article_title').get_text().split('\t')
    final_title_feature_img = [{
        'Title': relative_image_title[4],
        'URL': relative_image_path
    }]
    # Close the browser after scraping
    browser.quit()
    # Return the scraped object
    return final_title_feature_img
def featured_img():
    executable_path = {"executable_path": r"C:\Users\Mickey\anaconda3\Scripts\chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    # Browse to the URL
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(1)
    # Ask Splinter to click the element with id "full_image"
    full_image_button = browser.find_by_id("full_image")
    full_image_button.click()
    # Find the "More Info" button and click it
    browser.is_element_present_by_text("more info", wait_time=1)
    more_info_element = browser.find_link_by_partial_text("more info")
    more_info_element.click()
    # Parse results with BeautifulSoup (allow for try/except)
    html = browser.html
    image_soup = bs(html, "html.parser")
    img = image_soup.select_one("figure.lede a img")
    try:
        img_url = img.get("src")
    except AttributeError:
        return None
    # Combine with the base URL
    img_url = f"https://www.jpl.nasa.gov{img_url}"
    # Close the browser when done
    browser.quit()
    # Return results
    return img_url
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # ## NASA Mars News
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    html = browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')
    # Determine the element that contains the sought info
    element = soup.select_one("ul.item_list li.slide")
    title = element.find("div", class_="content_title").get_text()
    paragraph = element.find("div", class_='article_teaser_body').get_text()

    # ## JPL Mars Space Images - Featured Image
    # e.g. featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    element = browser.find_by_id("full_image")
    element.click()
    browser.is_element_present_by_text("more info", wait_time=1)
    findElement = browser.find_link_by_partial_text("more info")
    findElement.click()
    html = browser.html
    imagesoup = BeautifulSoup(html, 'html.parser')
    image = imagesoup.select_one("figure.lede a img")
    imagesource = image.get("src")
    featured_image_url = 'https://www.jpl.nasa.gov' + imagesource

    # ## Mars Facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    fact_df = tables[0]
    fact_df.columns = ["Description", "value"]
    fact_df.set_index("Description", inplace=True)
    html_table = fact_df.to_html(classes="table table-striped")

    # ## Mars Hemispheres
    hemurl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response = requests.get(hemurl)
    hemsoup = BeautifulSoup(response.text, "html.parser")
    items = hemsoup.find_all(class_="itemLink product-item")
    hem = []
    for image in items:
        image_url = "https://astrogeology.usgs.gov" + image.get("href")
        hem.append(image_url)
    # use distinct names here so the news `title` isn't clobbered in the loop
    hemurls = []
    for url in hem:
        response = requests.get(url)
        imgsoup = BeautifulSoup(response.text, "html.parser")
        imageurl = imgsoup.find("a", href=True, text="Sample")
        href = imageurl["href"]
        hem_title = imgsoup.find(class_="title").text.strip().replace(' Enhanced', '')
        hemurls.append({"title": hem_title, "img_url": href})

    data = {
        "news_title": title,
        "news_p": paragraph,
        "image": featured_image_url,
        "mars_df": html_table,
        "mars_hem": hemurls
    }
    return data
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA Mars news
    nasaURL = 'https://mars.nasa.gov/news/'
    browser.visit(nasaURL)
    time.sleep(2)
    soupNasa = bs(browser.html, 'html.parser')
    news_title = soupNasa.find('ul', class_='item_list').find(
        'div', class_="content_title").find('a').text.strip()
    news_p = soupNasa.find('div', class_="article_teaser_body").text.strip()
    news_date = soupNasa.find('div', class_="list_date").text.strip()

    # JPL featured image
    jplURL = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jplURL)
    time.sleep(2)
    browser.find_by_id("full_image").click()
    browser.find_link_by_partial_text("more info").click()
    soupJPL = bs(browser.html, 'html.parser')
    featured_image_url = 'https://www.jpl.nasa.gov' + soupJPL.find(
        'figure').find('img')['src']

    # Mars weather tweet
    twitterURL = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(twitterURL)
    time.sleep(2)
    soupTwitter = bs(browser.html, 'html.parser')
    mars_weather = soupTwitter.find("div", {"data-testid": "tweet"}).find(
        'div', {"lang": "en"}).find('span').text

    # Mars facts table
    factsURL = 'https://space-facts.com/mars/'
    tables = pd.read_html(factsURL)[0]
    tables.set_index(0, inplace=True)
    marsFacts = tables.to_html(header=False)

    # Mars hemispheres
    hemisphereURL = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    hemisphereBaseURL = 'https://astrogeology.usgs.gov'
    browser.visit(hemisphereURL)
    time.sleep(2)
    soupHemi = bs(browser.html, 'html.parser')
    hemisphere_image_urls = []
    hemis = soupHemi.find_all('div', class_="item")
    for h in hemis:
        tempURL = hemisphereBaseURL + h.find('a')['href']
        browser.visit(tempURL)
        time.sleep(2)
        soupSearch = bs(browser.html, 'html.parser')
        downloads = soupSearch.find("div", class_="downloads").find_all("li")
        for dl in downloads:
            if dl.find('a').text == "Sample":
                hemisphere_image_urls.append({
                    "title": soupSearch.find("h2", class_="title").text,
                    "img_url": dl.find('a')['href']
                })

    returnDictionary = {
        "news_title": news_title,
        "news_p": news_p,
        "news_date": news_date,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts_table": marsFacts,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    browser.quit()
    return returnDictionary
# JPL featured image (originally notebook cells)
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url_image)

browser.find_by_id("full_image").click()
browser.find_link_by_partial_text("more info").click()

# re-parse after the clicks so the large image is in the html
# (the original parsed before clicking and then referenced an undefined `soup`)
html_i = browser.html
soup_i = BeautifulSoup(html_i, 'html.parser')

# collect the src of every <img> on the page
image_i_big = []
for result_i in soup_i.find_all('img'):
    big_image = result_i['src']
    image_i_big.append(big_image)
def scrape_info(): browser = Browser("chrome") mars = {} url = 'https://mars.nasa.gov/news/' browser.visit(url) # Parse HTML with Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') # Find all the content title and paragraph news_title = soup.find_all('div', class_='content_title') news_p = soup.find_all('div', class_='article_teaser_body') print(news_title) print(news_p) # A blank list to hold the headlines " we are trying to display all titles " news_titles = [] # Loop over div elements for result in news_title: # Identify the anchor... if (result.a): # And the anchor has non-blank text... if (result.a.text): # Append thext to the list news_titles.append(result) news_titles # A blank list to hold the paragraphs " we are trying to display all paragraph " news_para = [] # Loop over div elements for result in news_p: # Identify the anchor... if (result.text): # Append thext to the list news_para.append(result) news_para #Top 5 Titles top_titles = [] # Print only the headlines for x in range(5): temp=news_titles[x].text newvar = temp.strip('\n\n') top_titles.append(newvar) mars["news_title"] =top_titles[0] #Top 5 Paragraph top_paragraph = [] # Print only the headlines for x in range(5): temp=news_para[x].text newvar = temp.strip('\n\n') top_paragraph.append(newvar) top_paragraph mars["news_paragraph"] =top_paragraph[0] # URL of page to be scraped url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) time.sleep(2) browser.find_by_id('full_image').click() time.sleep(2) browser.find_link_by_partial_text('more info').click() # Parse HTML with Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') featured_image_url = soup.find('figure', class_="lede") featured_image_url = featured_image_url.a.img["src"] featured_image_url main_url = 'https://www.jpl.nasa.gov' featured_image_url = main_url + featured_image_url featured_image_url mars["featured_image"] =featured_image_url ### Mars Facts # define url mars_facts_url = "https://space-facts.com/mars/" # read html into pandas tables = pd.read_html(mars_facts_url) # It returns 3 tables. The first has the data needed, so will convert to a dataframe and clean up nameing facts_mars = tables[0] facts_mars.columns = ["Description", "Value"] facts_mars #setting index facts_mars.set_index('Description', inplace=True) facts_mars.head() #Use Pandas to convert the data to a HTML table string. html_table = facts_mars.to_html() html_table mars["facts"] =html_table # define url and open in browser mars_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(mars_url) # Parse HTML with Beautiful Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') #finding titles of hemispheres hemisphere_titles = soup.find_all('h3') hemisphere_titles #Use a Python dictionary to store the data using the keys `img_url` and `title`. 
hemisphere_image_urls = [] #going through each title, clicking it opening the wide image coping url printing as we go along and putting in dictionary for i in range(len(hemisphere_titles)): hemisphere_title = hemisphere_titles[i].text print(hemisphere_title) hemisphere_images = browser.find_by_tag('h3') hemisphere_images[i].click() html = browser.html soup = BeautifulSoup(html, 'html.parser') img_url = soup.find('img', class_='wide-image')['src'] img_url = "https://astrogeology.usgs.gov" + img_url print(img_url) hemisphere_dict = {"title": hemisphere_title, "img_url":img_url} hemisphere_image_urls.append(hemisphere_dict) browser.back() #printing dictionary hemisphere_image_urls mars["hemispheres"] =hemisphere_image_urls return mars
def scrape_all():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    dic = {}

    # Scrape the NASA Mars News Site (https://mars.nasa.gov/news/) and collect the
    # latest news title and paragraph text; assign them to variables for later.
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find_all('div', class_='content_title')
    body = soup.find('div', class_='article_teaser_body')
    news_title = title[1].text
    news_p = body.text
    # store under string keys (the original used the values themselves as keys)
    dic['news_title'] = news_title
    dic['news_p'] = news_p

    # JPL Mars Space Images - Featured Image
    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)
    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find('figure', class_='lede')
    featured_image_url = 'https://www.jpl.nasa.gov' + imgs.a.img['src']
    dic['featured_image_url'] = featured_image_url

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    facts = pd.read_html(url)
    df = facts[0]
    df.columns = ['Profile', 'Values']
    df.set_index('Profile', inplace=True)
    # strip unwanted newlines to clean up the table
    html_facts = df.to_html().replace('\n', '')
    dic['facts'] = html_facts
    df.to_html('facts.html')

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    image_urls = []
    imgs = browser.find_by_css("a.product-item h3")
    for i in range(len(imgs)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[i].click()
        # Find the sample image
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        # Get the title
        hemisphere["title"] = browser.find_by_css("h2.title").text
        image_urls.append(hemisphere)
        # Navigate back to the listing
        browser.back()
    dic['hemisphere'] = image_urls
    return dic
def scrape_all():
    # Create the exe path for chrome and open a chrome window
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    print('-' * 85)
    print('Scraping Started')
    print('-' * 85)

    # --- NASA Mars news ---
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    news_html = browser.html
    soup = bs(news_html, 'lxml')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    print(f'News Title: {news_title}')
    print(f'News Para: {news_p}')
    print('-' * 85)

    # --- JPL featured image ---
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    time.sleep(1)
    jpl_html = browser.html
    soup = bs(jpl_html, 'lxml')
    image_link = soup.find(
        'div', class_='carousel_container').article.footer.a['data-fancybox-href']
    featured_image_url_medium = f'https://www.jpl.nasa.gov{image_link}'
    time.sleep(1)
    full_image_elem = browser.find_by_id("full_image")
    full_image_elem.click()
    time.sleep(1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    html = browser.html
    img_soup = bs(html, 'lxml')
    img_url_rel = img_soup.select_one('figure.lede a img').get("src")
    featured_image_url_large = f'https://www.jpl.nasa.gov{img_url_rel}'
    print(f'Featured Image: {featured_image_url_large}')
    print('-' * 85)

    # --- Mars weather (latest InSight tweet) ---
    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(1)
    weather_html = browser.html
    soup = bs(weather_html, 'lxml')
    weather_all = soup.find_all('div', class_='js-tweet-text-container')
    weather_list = []
    for x in weather_all:
        y = x.find('p', class_='js-tweet-text').text
        if "InSight" in y:
            weather_list.append(y)
    mars_weather = weather_list[0]
    print(f'Mars Weather: {mars_weather}')
    print('-' * 85)

    # --- Mars facts table (pd.read_html fetches the page itself) ---
    facts_url = "https://space-facts.com/mars/"
    facts_str = pd.read_html(facts_url)
    # https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.to_html.html
    facts_html = facts_str[1].to_html(index=False, header=False)

    # --- Mars hemispheres ---
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)
    # Find the links
    image_urls = [(a.text, a['href'])
                  for a in browser.find_by_css('div[class="description"] a')]
    hemisphere_image_urls = []
    for title, url in image_urls:
        temp = {'title': title}
        browser.visit(url)
        temp['img_url'] = browser.find_by_css('img[class="wide-image"]')['src']
        hemisphere_image_urls.append(temp)
    print(f'Dict: {hemisphere_image_urls}')
    print('-' * 85)

    data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": featured_image_url_large,
        "weather": mars_weather,
        "facts_html": facts_html,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    return data

# test = scrape_all()
# Visit the JPL featured-image page (originally notebook cells)
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

time.sleep(2)
browser.find_by_id("full_image").click()

time.sleep(2)
browser.find_link_by_partial_text("more info").click()

image = bs(browser.html, "html.parser")

image_url = image.find("figure", class_="lede").find("img")["src"]

final_url = "https://www.jpl.nasa.gov" + image_url
if JANUS:
    url = 'http://apps.webofknowledge.com.dianus.libr.tue.nl/DIIDW_AdvancedSearch_input.do?' \
          'SID=V2i7L6wGDEBBsnkAWFI&product=DIIDW&search_mode=AdvancedSearch'
    browser.visit(url)
    # this redirects to janus; fill in login info
    browser.fill('user', USERNAME)
    browser.fill('pass', PASSWORD)
    # find and click the login button
    browser.find_by_value('Login').first.click()
else:
    url = ('http://apps.webofknowledge.com/DIIDW_AdvancedSearch_input.do?'
           'SID=N1cpglrQOdCmC16gM44&product=DIIDW&search_mode=AdvancedSearch')
    browser.visit(url)
    # if a new session needs to be started, click the link
    try:
        browser.find_link_by_partial_text('new session').first.click()
    except Exception:
        pass


def Build_Query_Citations(codes):
    # iterate through the list and build the query
    query = "CD=("
    for code in codes:
        if query == "CD=(":
            query += code
        else:
            query += " OR " + code
    query += ")"
    return query
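Build_Query_Citations simply ORs the given codes inside a CD=() clause; a quick sketch of the expected output, using made-up codes:

# Example with made-up Derwent class codes:
#   Build_Query_Citations(['X21', 'Y02-B']) -> 'CD=(X21 OR Y02-B)'
print(Build_Query_Citations(['X21', 'Y02-B']))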
def scrape_info():
    # splinter setup
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    mars = {}

    # scrape the news site
    url = ('https://mars.nasa.gov/news/?page=0&per_page=40'
           '&order=publish_date+desc%2Ccreated_at+desc&search='
           '&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(browser.html, 'html.parser')
    # pull the title and teaser paragraph
    news_title = soup.find_all('div', class_='content_title')[1].a.text
    mars["news_title"] = news_title
    news_p = soup.find_all('div', class_='article_teaser_body')[0].text
    mars["news_p"] = news_p

    # featured image via splinter
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(2)
    browser.find_by_id("full_image").click()
    time.sleep(2)
    browser.find_link_by_partial_text("more info").click()
    time.sleep(2)
    soup = BeautifulSoup(browser.html, 'html.parser')
    result = soup.find("figure", class_="lede").a.img["src"]
    featured_image_link = "https://www.jpl.nasa.gov" + result
    mars["featured_image_link"] = featured_image_link

    # Mars facts
    url = 'https://space-facts.com/mars/'
    table = pd.read_html(url)
    df = table[0]
    df.columns = ["Description", "Values"]
    df.set_index("Description", inplace=True)
    html_table = df.to_html().replace('\n', '')
    mars["Facts"] = html_table

    # Mars hemispheres
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    hemisphere_image_url = []
    for i in range(4):
        hemispheres = {}
        time.sleep(2)
        browser.find_by_css("a.product-item h3")[i].click()
        soup = BeautifulSoup(browser.html, 'html.parser')
        hemispheres["title"] = soup.find("h2", class_="title").get_text()
        hemispheres["img_url"] = soup.find("a", text="Sample").get("href")
        hemisphere_image_url.append(hemispheres)
        browser.back()

    # flatten everything into the returned dict
    mars = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_link": featured_image_link,
        "Facts": html_table,
        "hemisphere_image_title_1": hemisphere_image_url[0]["title"],
        "hemisphere_image_url_1": hemisphere_image_url[0]["img_url"],
        "hemisphere_image_title_2": hemisphere_image_url[1]["title"],
        "hemisphere_image_url_2": hemisphere_image_url[1]["img_url"],
        "hemisphere_image_title_3": hemisphere_image_url[2]["title"],
        "hemisphere_image_url_3": hemisphere_image_url[2]["img_url"],
        "hemisphere_image_title_4": hemisphere_image_url[3]["title"],
        "hemisphere_image_url_4": hemisphere_image_url[3]["img_url"]
    }
    return mars
class MegBotMentions:
    """MegBotMentions is an interface to Facebook Messages. You can use it by
    logging into your account and giving it a group message you are a member of.
    You can navigate to different pages of the message, read the page, and write
    to the group chat. Please make sure the other members are aware MegBotMentions
    is joining y'all in your conversation."""

    def __init__(self, un, pw):
        # assumes module-level constants _base_url, _base_msg_url, _msg_url_mid
        self.username = un
        self.password = pw
        self.currentPage = 0
        self.messageID = 0
        self.browser = Browser("phantomjs")

    def login(self):
        self.browser.visit(_base_url)
        self.browser.fill('email', self.username)
        self.browser.fill('pass', self.password)
        self.browser.find_by_css('input[type="submit"]').first.click()
        print("Logged in!")
        if self.messageID:
            self.move_to_message(self.messageID)

    def move_to_message(self, mID):
        self.currentPage = 0
        self.messageID = mID.strip()
        self.browser.visit(_base_msg_url + self.messageID)
        print(_base_msg_url + self.messageID)
        time.sleep(3)
        # self.send_message(_intro_message)

    def refresh_messages(self):
        if self.messageID == 0:
            return
        self.currentPage = 0
        self.browser.visit(_base_msg_url + self.messageID)
        time.sleep(3)

    def next_page(self):
        if self.messageID == 0:
            return False
        self.currentPage += 1
        print(self.currentPage)
        self.browser.visit(_base_msg_url + self.messageID + _msg_url_mid +
                           str(5 * self.currentPage))
        time.sleep(3)

    def send_message(self, inWords, receiver):
        # navigate back to the messages page
        self.browser.visit("https://mbasic.facebook.com/messages/")
        newconvo = self.browser.find_link_by_partial_text(receiver.title()).first
        newconvo.click()
        self.browser.find_by_id('composerInput')[0].fill(
            "Hey, " + receiver + ", you've been mentioned in a chat")
        self.browser.find_by_css('input[name="send"]').first.click()
        time.sleep(3)

    def read_messages(self):
        messages = []
        mHTML = self.browser.find_by_id("messageGroup").find_by_css("div")
        # the first div holds a link we don't need right now
        link = mHTML.pop(0).find_by_css("a")["href"]
        lines = mHTML.find_by_css("span")
        for line in lines:
            # keep printable characters only, drop metadata lines
            l = ''.join(ch for ch in line.text.strip() if ch in string.printable)
            if l and l != "." and "Sent from" not in l and "Seen by" not in l:
                messages.insert(0, l)
        return messages
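A hedged usage sketch for MegBotMentions, following the flow its docstring describes (log in, attach to a group message, read, then send); the credentials and thread ID below are placeholders:

# Hypothetical driver code; the login and thread ID are made-up placeholders.
bot = MegBotMentions('user@example.com', 'hunter2')
bot.login()
bot.move_to_message('123456789')  # made-up message thread ID
for msg in bot.read_messages():
    print(msg)
bot.send_message("hello", "alice")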
from conn_info import *

parser = OptionParser()
parser.add_option("-d", "--dept", dest="dept_index", default=0,
                  help="department index")
options, args = parser.parse_args()
dept_index = int(options.dept_index)

t0 = time.time()
browser = Browser('zope.testbrowser')
browser.visit(SCODOC)
print('Start: title:', browser.title)
print('URL: ', browser.url)
# print(browser.html)

# 'Scolarit' matches the partial link text "Scolarité"
links = browser.find_link_by_partial_text('Scolarit')
print('%d departments' % len(links))
links[dept_index].click()  # go to the selected department

# ---- Authentication form
print('Authentication: ', browser.url)
browser.fill('__ac_name', USER)
browser.fill('__ac_password', PASSWD)
button = browser.find_by_id('submit')
button[0].click()

# ---- Dept home page
print(browser.url)
class KuchIterator:
    def __init__(self):
        # assumes a module-level PAGE_URL constant
        self.browser = Browser('phantomjs')
        self.browser.visit(PAGE_URL)
        self.get_next_row()

    def __iter__(self):
        return self

    def get_next_row(self):
        soup = bs4.BeautifulSoup(self.browser.html, 'html.parser')
        self.schedule = soup.find_all('table')[0]
        self.cur_row = self.schedule.tbody.tr

    def get_a_children(self, parent):
        out = ""
        for c1 in parent.children:
            if getattr(c1, 'name', None):
                if c1.name == 'a':
                    if c1.string:
                        out += c1.string
                elif c1.name in ['i', 'br']:
                    out += ", "
                out += self.get_a_children(c1)
            else:
                if c1.string:
                    out += c1.string
        return out

    def __next__(self):  # Py3 iterator protocol (was `next` in the Py2 original)
        try:
            c1 = self.cur_row.td
        except AttributeError:
            # ran off the end of this page's rows: page forward and retry
            self.browser.find_link_by_partial_text('next').click()
            time.sleep(5)
            self.get_next_row()
            c1 = self.cur_row.td
        when1 = c1.span['content']
        c1 = c1.next_sibling
        who1 = c1.next_sibling.div.ul.li.contents[0]
        if not isinstance(who1, bs4.element.NavigableString):
            who1 = who1.contents[0]
        # skip ahead four cells to the venue column
        c1 = c1.next_sibling.next_sibling.next_sibling.next_sibling
        where1 = c1.next_sibling.a.contents[0]
        # advance past this row and the separator row
        self.cur_row = self.cur_row.next_sibling.next_sibling
        # drop any trailing timezone offset before parsing
        pos1 = when1.find('+')
        if pos1 != -1:
            when1 = when1[:pos1]
        date1 = datetime.datetime.strptime(when1, '%Y-%m-%dT%H:%M:%S')
        return {
            'year': date1.year,
            'month': date1.month,
            'day': date1.day,
            'hour': date1.hour,
            'min': date1.minute,
            'what': who1,
            'where': where1,
            'content': 'musicrux',
        }
def scrape_info():
    # run ChromeDriverManager
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    mars = {}
    # HTML object, parsed with Beautiful Soup
    html = browser.html
    soup = bs(html, 'html.parser')
    # Retrieve the elements that contain the mars article information
    result1 = soup.find_all('div', class_='content_title')
    result2 = soup.find('div', class_='article_teaser_body')
    mars_article = result1[1].text.strip()
    mars_body_text = result2.text.strip()
    mars["news_title"] = mars_article
    mars["news_p"] = mars_body_text

    # visit the url for the JPL Featured Space Image
    url_2 = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url_2)
    time.sleep(.5)
    browser.find_link_by_partial_text('FULL IMAGE').click()
    html = browser.html
    soup = bs(html, 'html.parser')
    # Retrieve the element that contains the image information
    mars_image = soup.find('img', class_='headerimage fade-in')
    featured_image_url = ('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'
                          + mars_image["src"])
    mars["featured_image_url"] = featured_image_url

    # Mars facts table (set_index with inplace=True returns None, so don't chain it)
    url_3 = 'https://space-facts.com/mars/'
    mars_tables = pd.read_html(url_3)
    mars_df = mars_tables[0]
    mars_df.set_index(0, inplace=True)
    mars_df = mars_df.rename(columns={0: ' ', 1: ' '})
    clean_mars_html_table = mars_df.to_html().replace('\n', '')
    mars["facts"] = clean_mars_html_table

    # Mars hemispheres
    url_4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_4)
    link_list = browser.find_by_css("a.product-item h3")
    mars_hemisphere_image_urls = []
    for x in range(len(link_list)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[x].click()
        sample = browser.links.find_by_text("Sample").first
        hemisphere["img_url"] = sample["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        mars_hemisphere_image_urls.append(hemisphere)
        browser.back()
    mars["hemisphere"] = mars_hemisphere_image_urls
    browser.quit()
    # Return results
    return mars
def mars_scrape():
    mars = {}
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)

    # latest news title and paragraph
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    sleep(1)
    ourwebpage = browser.html
    soup = bs(ourwebpage, 'html.parser')
    x = soup.body.find_all(class_="content_title")
    alltitle = [i.find('a').text.strip() for i in x[1:]]
    mars['title'] = alltitle[0]
    paragraph = soup.body.find_all(class_="article_teaser_body")
    news_p = [i.text for i in paragraph]
    mars['news_paragraph'] = news_p[0]

    # featured image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    sleep(1)
    browser.click_link_by_id('full_image')
    z = browser.find_link_by_partial_text("more info")
    z.click()
    sleep(1)
    imgwebpage = browser.html
    soup2 = bs(imgwebpage, 'html.parser')
    image_path = soup2.find(class_="main_image")['src']
    mars["feature_img"] = "https://www.jpl.nasa.gov" + image_path

    # facts table
    mars_table = pd.read_html("https://space-facts.com/mars/")[0]
    mars_table.rename(columns={0: "Category", 1: "Value"}, inplace=True)
    mars["mars_table"] = mars_table

    # hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    sleep(1)
    image = []
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        sleep(1)
        html_image = browser.html
        soupitem = bs(html_image, 'html.parser')
        zz = soupitem.find('a', text="Sample")
        image.append(zz['href'])
        browser.back()
    hemisphere_image_urls = [
        {"title": "Valles Marineris Hemisphere", "img_url": image[0]},
        {"title": "Cerberus Hemisphere", "img_url": image[1]},
        {"title": "Schiaparelli Hemisphere", "img_url": image[2]},
        {"title": "Syrtis Major Hemisphere", "img_url": image[3]},
    ]
    mars["mars_image"] = hemisphere_image_urls
    return mars
# ""### Featured Images" # In[9]: # Visit URL url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) # In[10]: # Find and click the full image button full_image_elem = browser.find_by_id('full_image') full_image_elem.click() # In[11]: # Find the more info button and click that browser.is_element_present_by_text('more info', wait_time=1) more_info_elem = browser.find_link_by_partial_text('more info') more_info_elem.click() # In[12]: # Parse the resulting html with soup html = browser.html img_soup = BeautifulSoup(html, 'html.parser') # In[13]: browser.is_element_present_by_text('main_image', wait_time=1) # Find the relative image url img_url_rel = img_soup.select_one('figure.lede a img').get("src") img_url_rel # In[14]: # Use the base URL to create an absolute URL
def scrape():
    # Dependencies
    import time
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from splinter import Browser
    import pymongo

    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_data = db.mars_data
    db.mars_data.drop()

    # ### NASA Mars News (one browser is reused for every section below)
    browser = Browser('chrome', headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    # Extract news title text
    title = soup.find('div', class_='bottom_gradient').text
    print(title)
    # Extract paragraph text
    paragraph = soup.find('div', class_='rollover_description_inner').text
    print(paragraph)

    # ### JPL Mars Space Images - Featured Image
    # Use splinter to navigate the site, find the url for the current Featured
    # Mars Image, and assign the full-size .jpg url to featured_image_url, e.g.:
    # featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    browser.visit(img_url)
    time.sleep(5)
    browser.click_link_by_id('full_image')
    time.sleep(5)
    browser.find_link_by_partial_text('more info').click()
    time.sleep(5)
    featured_image_url = browser.find_by_tag('img')[6]['src']

    # ### Mars Weather
    # Scrape the latest Mars weather tweet and save its text as mars_weather, e.g.:
    # mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F,
    #                 pressure at 8.82 hPa, daylight 06:09-17:55'
    tw_acct_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tw_acct_url)
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', class_='js-tweet-text-container')
    mars_weather = container.find(
        'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    # ### Mars Facts
    # Scrape the table of planet facts (diameter, mass, etc.) with Pandas
    marsfacts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(marsfacts_url)
    df = tables[0]
    df = df.rename(columns={0: 'Measurement', 1: 'Value'})
    df = df.set_index('Measurement')
    # convert the table to an html string and strip unwanted newlines
    html_table = df.to_html().replace('\n', '')

    # ### Mars Hemispheres
    # Visit the USGS Astrogeology site, click through to each hemisphere, and save
    # the full-resolution image url and the hemisphere title for each one.
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    time.sleep(5)

    browser.find_by_css('h3')[0].click()
    img1_url = browser.find_by_tag('a')[41]['href']  # anchor index 41 is brittle
    img1_title = browser.find_by_css('h2')[0].text.replace(' Enhanced', '')

    browser.back()
    browser.find_by_css('h3')[1].click()
    img2_url = browser.find_by_tag('a')[41]['href']
    img2_title = browser.find_by_css('h2')[0].text.replace(' Enhanced', '')

    browser.back()
    browser.find_by_css('h3')[2].click()
    img3_url = browser.find_by_tag('a')[41]['href']
    img3_title = browser.find_by_css('h2')[0].text.replace(' Enhanced', '')

    browser.back()
    browser.find_by_css('h3')[3].click()
    img4_url = browser.find_by_tag('a')[41]['href']
    img4_title = browser.find_by_css('h2')[0].text.replace(' Enhanced', '')

    # Store the data using the keys img_url and title
    hemisphere_img_dict = [
        {"title": img1_title, "img_url": img1_url},
        {"title": img2_title, "img_url": img2_url},
        {"title": img3_title, "img_url": img3_url},
        {"title": img4_title, "img_url": img4_url},
    ]

    data_outputs = {
        'title': title,
        'paragraph': paragraph,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'html_table': html_table,
        'hemisphere_img_dict': hemisphere_img_dict
    }
    mars_data.insert_one(data_outputs)  # `insert` is deprecated in pymongo 3+
    return data_outputs
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser


def scrape_info():
    browser = Browser('chrome')
    mars = {}

    # # Scraping

    # # NASA Mars News

    # Pull the latest headline and teaser from the news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.find_all('div', class_="content_title")
    news_title = titles[1].text
    body = soup.find_all('div', class_="article_teaser_body")
    news_p = body[0].text
    mars["news_title"] = news_title
    mars["news_p"] = news_p

    # Pull titles and bodies from the news slides
    results = soup.find_all('div', class_="slide")
    for result in results:
        titles = result.find('div', class_="content_title")
        title = titles.find('a').text
        bodies = result.find('div', class_="rollover_description")
        body = bodies.find('div', class_="rollover_description_inner").text
        print('----------------')
        print(title)
        print(body)

    # # JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_by_id("full_image").click()
    time.sleep(2)
    browser.find_link_by_partial_text('more info').click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    result = soup.find('figure', class_='lede')
    featured_image_url = 'https://www.jpl.nasa.gov' + result.a.img["src"]
    mars["featured_image"] = featured_image_url

    # # Mars Facts
    mars_facts_url = "https://space-facts.com/mars/"
    table = pd.read_html(mars_facts_url)
    df = table[0]
    df.columns = ["Facts", "Value"]
    # set_index returns a new frame, so keep the result
    df = df.set_index("Facts")
    facts_html = df.to_html().replace("\n", "")
    mars["facts"] = facts_html

    # # Mars Hemispheres
    # The original reused one `Hemisphere` dict across all four appends (so
    # every list entry pointed at the same object) and sometimes stored a
    # dict or a title under "img_url"; each block now appends a fresh dict
    # with the url string.
    hemisphere_image_urls = []

    # Cerberus Hemisphere
    url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cerberus_img = soup.find_all('div', class_="wide-image-wrapper")
    for img in cerberus_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
        print(full_img)
        cerberus_title = soup.find('h2', class_='title').text
        print(cerberus_title)
        hemisphere_image_urls.append(
            {"title": cerberus_title, "img_url": full_img})

    # Schiaparelli Hemisphere
    url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    schiaparelli_img = soup.find_all('div', class_="wide-image-wrapper")
    for img in schiaparelli_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
        print(full_img)
        schiaparelli_title = soup.find('h2', class_='title').text
        print(schiaparelli_title)
        hemisphere_image_urls.append(
            {"title": schiaparelli_title, "img_url": full_img})

    # Syrtis Major Hemisphere
    url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    syrtis_img = soup.find_all('div', class_="wide-image-wrapper")
    for img in syrtis_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
        print(full_img)
        syrtis_title = soup.find('h2', class_='title').text
        print(syrtis_title)
        hemisphere_image_urls.append(
            {"title": syrtis_title, "img_url": full_img})

    # Valles Marineris Hemisphere
    url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    valles_marineris_img = soup.find_all('div', class_="wide-image-wrapper")
    for img in valles_marineris_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
        print(full_img)
        valles_marineris_title = soup.find('h2', class_='title').text
        print(valles_marineris_title)
        hemisphere_image_urls.append(
            {"title": valles_marineris_title, "img_url": full_img})

    mars["hemisphere"] = hemisphere_image_urls
    return mars
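# The four hemisphere blocks in scrape_info() differ only in the page slug,
# so they could collapse into a single loop. A sketch under that assumption
# -- same four Viking pages, one fresh dict per pass; the function name is
# hypothetical and not part of the original source:
import requests
from bs4 import BeautifulSoup


def scrape_hemispheres_compact():
    base = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/'
    slugs = ['cerberus_enhanced', 'schiaparelli_enhanced',
             'syrtis_major_enhanced', 'valles_marineris_enhanced']
    hemisphere_image_urls = []
    for slug in slugs:
        soup = BeautifulSoup(requests.get(base + slug).text, 'html.parser')
        wrapper = soup.find('div', class_='wide-image-wrapper')
        full_img = wrapper.find('li').find('a')['href']
        title = soup.find('h2', class_='title').text
        # Append a new dict each pass so the entries stay independent
        hemisphere_image_urls.append({'title': title, 'img_url': full_img})
    return hemisphere_image_urls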
from flask import jsonify

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser


def scrape():
    mars = mongo.db.mars  # `mongo` is the app's Flask-PyMongo handle

    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {
        'executable_path': 'C:\\Users\\enere\\Desktop\\chromedriver'
    }
    browser = Browser('chrome', **executable_path)

    ##### MARS NEWS Scrape #####
    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Convert the browser html to a soup object
    html = browser.html
    news_scraper = BeautifulSoup(html, 'html.parser')

    # Find the first content title and save it as `news_title`
    title_element = news_scraper.find('div', {'class': 'content_title'})
    news_title = title_element.get_text()

    # Find the paragraph (teaser) text
    teaser_element = news_scraper.find('div', {'class': 'article_teaser_body'})
    teaser_text = teaser_element.get_text()

    ##### JPL Space Images Featured Image #####
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click it
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_scraper = BeautifulSoup(html, 'html.parser')

    # Find the relative image url
    img_element = img_scraper.find('img', {'class': 'main_image'})
    img_src = img_element.get('src')

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_src}'

    ##### Mars Weather Scrape #####
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div', attrs={
        "class": "tweet",
        "data-name": "Mars Weather"
    })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()

    ##### Mars Facts Scrape #####
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Mars - Earth Comparison', 'Mars', 'Earth']
    # Set the index to the Mars - Earth Comparison column
    df.set_index('Mars - Earth Comparison', inplace=True)
    # Convert the DataFrame back to an HTML table (the original called
    # to_html() before df existed; the conversion has to come after)
    html_table = df.to_html()
    df.to_html('table.html')

    ##### Mars Hemisphere Scrape #####
    url = ('https://astrogeology.usgs.gov/search/results'
           '?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemisphere links
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click each one, find the sample anchor,
    # and record its href
    for i in range(len(links)):
        hemisphere = {}
        # We have to find the elements on each loop to avoid a stale element
        # exception
        browser.find_by_css("a.product-item h3")[i].click()
        # Find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        # Get the hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text
        # Append the hemisphere object to the list
        hemisphere_image_urls.append(hemisphere)
        # Finally, navigate backwards to the results page
        browser.back()

    browser.quit()

    ##### Create a dictionary to store our scraped data #####
    scraped_data = {
        'News Title': news_title,
        'Teaser Text': teaser_text,
        'Image URL': img_url,
        'Mars Weather': mars_weather,
        'Mars Hemisphere': hemisphere_image_urls,
        'Mars Facts': html_table
    }

    ##### Put into MongoDB #####
    mars.update({}, scraped_data, upsert=True)

    return jsonify(scraped_data)
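# scrape() above references a module-level `mongo` handle and returns
# jsonify(), which needs a Flask request context. A minimal wiring sketch
# under those assumptions -- app name, route, and Mongo URI are hypothetical,
# not part of the original source:
from flask import Flask
from flask_pymongo import PyMongo

app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_app"
mongo = PyMongo(app)


@app.route("/scrape")
def run_scrape():
    # Routing scrape() directly keeps its jsonify() return inside a
    # request context
    return scrape()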