def featured_image(browser):
    """Return the absolute URL of the currently featured Mars image.

    Visits spaceimages-mars.com, opens the full-size view and scrapes the
    fancybox image source. Returns None if the image tag cannot be found.
    """
    base = 'https://spaceimages-mars.com'
    browser.visit(base)

    # The second <button> on the page opens the full-size image overlay.
    browser.find_by_tag('button')[1].click()

    page_soup = soup(browser.html, 'html.parser')

    try:
        relative_src = page_soup.find('img', class_='fancybox-image').get('src')
    except AttributeError:
        # Image tag missing (page layout changed or load failed).
        return None

    return f'{base}/{relative_src}'
def mars_news(browser):
    """Scrape the latest headline and teaser from the NASA Mars news site.

    Returns a (title, paragraph) tuple, or (None, None) when the expected
    elements are not present on the page.
    """
    browser.visit('https://mars.nasa.gov/news/')

    # Give the page up to one second for the news list to render.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    news_soup = soup(browser.html, 'html.parser')

    try:
        first_slide = news_soup.select_one('ul.item_list li.slide')
        headline = first_slide.find("div", class_="content_title").get_text()
        teaser = first_slide.find("div", class_="article_teaser_body").get_text()
    except AttributeError:
        # Either the slide or one of its children was missing.
        return None, None

    return headline, teaser
def featured_image(browser):
    """Return the absolute URL of JPL's featured Mars space image.

    Clicks through the "full image" and "more info" pages, then scrapes the
    lede figure's image source. Returns None when the image is not found.
    """
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')

    # Open the full-image view.
    browser.find_by_id('full_image').click()

    # Wait for, then follow, the "more info" link to the detail page.
    browser.is_element_present_by_text('more info', wait_time=1)
    browser.links.find_by_partial_text('more info').click()

    detail_soup = soup(browser.html, 'html.parser')

    try:
        relative_src = detail_soup.select_one('figure.lede a img').get("src")
    except AttributeError:
        # The lede figure (or its nested img) is absent.
        return None

    return f'https://www.jpl.nasa.gov{relative_src}'
def LoadDataHtml(writedata: list):
    """Scrape product details from apteka-ot-sklada.ru for the current item.

    Relies on module-level globals: `browser` (splinter browser), `item`
    (the current price-position record being processed) and `VerifiedData`
    (list of codes that produced no search results).

    Returns a dict of product fields (name, counts, description, image URL),
    or None when the search yields no product cards — in that case the code
    is appended to VerifiedData and the list is persisted to Verified.json.

    NOTE(review): the `writedata` parameter is immediately shadowed by the
    result dict and its incoming value is never read — confirm with callers
    whether it can be dropped.
    """
    browser.visit('https://apteka-ot-sklada.ru/catalog?q=' + item['pricepos_code'])
    soup = BeautifulSoup(browser.html, 'html.parser')
    data = soup.find_all(
        "div", class_="ui-card goods-card goods-grid__cell goods-grid__cell_size_3")

    if len(data) == 0:
        # No product found: remember the code and persist the whole list.
        VerifiedData.append(item['pricepos_code'])
        # BUG FIX: the original opened the file without a context manager,
        # leaking the handle if the write raised.
        with open('Verified.json', 'w') as fh:
            json.dump(VerifiedData, fh)
        return

    # Re-parse just the matched cards and follow the first product link.
    soup = BeautifulSoup(str(data), 'html.parser')
    links = soup.find_all("a", href=True)
    browser.visit('https://apteka-ot-sklada.ru' + str(links[0]['href']))
    soup = BeautifulSoup(browser.html, 'html.parser')

    image = soup.find_all(
        'img', class_='goods-photo goods-gallery__picture')[0]['src']
    image = 'https://apteka-ot-sklada.ru' + image

    description = soup.find_all('div', class_='custom-html content-html')[0].get_text()

    writedata = dict(pricepos_name=item['pricepos_name'],
                     pricepos_count=item['pricepos_count'],
                     pricepos_value=item['pricepos_value'],
                     pricepos_country=item['pricepos_country'],
                     pricepos_code=item['pricepos_code'],
                     description=description,
                     image=image)
    return writedata
def scrape_mars_weather():
    """Scrape the latest Mars weather tweet from @MarsWxReport.

    Stores the tweet text under "weather_tweet" in the module-level
    `mars_information` dict and returns that dict. The browser is always
    closed, even on error.
    """
    browser = None  # FIX: defined before try so `finally` can't hit NameError
    try:
        browser = initialize_browser()
        weather_url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(weather_url)
        html_weather = browser.html
        soup = bs(html_weather, "html.parser")
        recent_tweets = soup.find_all("div", class_="js-tweet-text-container")
        weather_tweet = None  # FIX: defined even when no tweets are found
        for tweet in recent_tweets:
            weather_tweet = tweet.find("p").text
            # BUG FIX: the original condition was
            # `if "Sol" and "pressure" in weather_tweet:` which only tests
            # "pressure" because the literal "Sol" is always truthy.
            if "Sol" in weather_tweet and "pressure" in weather_tweet:
                print(weather_tweet)
                break
        mars_information["weather_tweet"] = weather_tweet
        return mars_information
    finally:
        if browser is not None:
            browser.quit()
def mars_news(browser):
    """Scrape the newest headline and teaser from redplanetscience.com.

    Returns a (title, paragraph) tuple, or (None, None) when the expected
    elements are missing from the page.
    """
    browser.visit('https://redplanetscience.com')

    # Allow up to a second for the article list to appear.
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    news_soup = soup(browser.html, 'html.parser')

    try:
        latest = news_soup.select_one('div.list_text')
        headline = latest.find('div', class_='content_title').get_text()
        teaser = latest.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        # The list container or one of its children was not found.
        return None, None

    return headline, teaser
def mars_weather(browser):
    """Scrape InSight's daily weather report table and return it as HTML.

    Prints the prettified table for inspection, then returns the same
    prettified HTML string.
    """
    browser.visit('https://mars.nasa.gov/insight/weather/')

    weather_soup = soup(browser.html, 'html.parser')

    # The daily weather report lives in the table carrying class "mb_table".
    report_table = weather_soup.find('table', class_='mb_table')
    print(report_table.prettify())

    return report_table.prettify()
def scrape_mars_news():
    """Scrape the latest NASA Mars news headline and story teaser.

    Stores them under "news_headline" / "news_story" in the module-level
    `mars_information` dict and returns that dict. The browser is closed in
    all cases via the finally clause.
    """
    try:
        browser = initialize_browser()
        browser.visit("https://mars.nasa.gov/news/")
        page = bs(browser.html, "html.parser")

        # First headline link and its accompanying teaser paragraph.
        headline = page.find("div", class_="content_title").find("a").text
        story = page.find("div", class_="article_teaser_body").text

        mars_information["news_headline"] = headline
        mars_information["news_story"] = story
        return mars_information
    finally:
        browser.quit()
def scrape_mars_image():
    """Scrape the JPL featured-image URL from the carousel's inline style.

    The URL is embedded as `background-image: url('...')` on the <article>
    element; the wrapper text and surrounding quotes are stripped before the
    site root is prepended. Result is stored under "featured_image_url" in
    the module-level `mars_information` dict, which is returned.
    """
    try:
        browser = initialize_browser()
        browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        page = bs(browser.html, "html.parser")

        # Strip "background-image: url(" / ");" then the quote characters.
        style_value = page.find("article")["style"]
        relative = style_value.replace("background-image: url(", "").replace(");", "")[1:-1]

        featured_image_url = "https://www.jpl.nasa.gov" + relative
        mars_information["featured_image_url"] = featured_image_url
        return mars_information
    finally:
        browser.quit()
def hemispheres(browser):
    """Collect full-resolution image URLs and titles for all four Mars
    hemispheres from marshemispheres.com.

    Returns a list of four dicts, each with "img_url" and "title" keys.
    """
    browser.visit('https://marshemispheres.com/')

    collected = []

    # The landing page lists exactly four hemisphere thumbnails.
    for idx in range(4):
        entry = {}

        # Open the detail page for this hemisphere.
        browser.find_by_tag('a.itemLink h3')[idx].click()

        # The "Sample" link points at the full-resolution image.
        entry["img_url"] = browser.links.find_by_text('Sample')['href']
        entry["title"] = browser.find_by_tag('h2').text

        collected.append(entry)

        # Return to the index page for the next thumbnail.
        browser.back()

    return collected
def scrape():
    """Notebook-export scraper: JPL featured image and Mars weather tweets.

    NOTE(review): the NASA news page is visited but nothing is scraped from
    it, and the function ends without returning — this looks like a
    truncated notebook export; confirm against the original notebook.
    """
    browser = init_browser()

    # NASA Mars News page (opened only; no data extracted here).
    browser.visit('https://mars.nasa.gov/news/')
    time.sleep(2)

    # JPL Mars Space Images - featured image.
    featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(featured_image_url)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')

    # The image URL is embedded in the article's inline background style.
    featured_image = soup.find('article')['style'].replace(
        'background-image: url(', '').replace(');', '')
    # Slice off the surrounding quote characters.
    featured_image = featured_image[1:-1]
    image_url = 'https://www.jpl.nasa.gov' + featured_image

    # Mars weather: load the rover's Twitter feed and grab the tweet blocks.
    browser.visit('https://twitter.com/marswxreport?lang=en')
    time.sleep(2)
    soup = BeautifulSoup(browser.html, 'html.parser')

    current_tweet = soup.find_all('div', class_='js-tweet-text-container')
def scrape():
    """Scrape NASA news, the JPL featured image, the Mars weather tweet,
    the facts table and USGS hemisphere images into one dict.

    Returns a dict with keys: news_title, news_p, featured_image_url,
    mars_weather, mars_facts_table, hemisphere_img.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_mission = {}

    # --- Latest NASA Mars news ---
    browser.visit('https://mars.nasa.gov/news/')
    soup = bs(browser.html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured space image ---
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    soup = bs(browser.html, 'html.parser')
    image = soup.find_all('img')[5]["src"]
    featured_img_url = "https://jpl.nasa.gov" + image
    print(featured_img_url)

    # --- Latest Mars weather tweet ---
    browser.visit("https://twitter.com/marswxreport?lang=en")
    soup = bs(browser.html, 'html.parser')
    mars_weathers = []
    for weather_info in soup.find_all(
            'p',
            class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'):
        mars_weathers.append(weather_info.text.strip())
    mars_weather = mars_weathers[0]

    # --- Mars facts table ---
    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    mars_facts = pd.read_html(url4)[0]
    mars_facts.columns = ["Description", "Value"]
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, so the stored HTML kept its newlines.
    mars_facts_html = mars_facts.to_html().replace('\n', '')
    mars_facts.to_html('mars_table.html')

    # --- USGS hemisphere names and images ---
    url5 = 'https://astrogeology.usgs.gov'
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)
    hemispheres_soup = bs(browser.html, 'html.parser')
    all_mars_hemispheres = hemispheres_soup.find('div', class_='collapsible results')
    mars_hemispheres = all_mars_hemispheres.find_all('div', class_='item')

    hemisphere_img_urls = []
    for i in mars_hemispheres:
        hemisphere = i.find('div', class_="description")
        title = hemisphere.h3.text
        # Follow the detail link to reach the full-resolution download link.
        browser.visit(url5 + hemisphere.a["href"])
        img_soup = bs(browser.html, 'html.parser')
        img_url = img_soup.find('div', class_='downloads').find('li').a['href']
        hemisphere_img_urls.append({'title': title, 'img_url': img_url})

    browser.quit()  # FIX: original leaked the chromedriver session

    mars_mission = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_img_url,
        "mars_weather": mars_weather,
        "mars_facts_table": str(mars_facts_html),
        "hemisphere_img": hemisphere_img_urls
    }

    return mars_mission
def Scrape():
    """Scrape NASA news (via requests), the JPL featured image (via
    splinter) and the latest Mars weather tweet into `mars_dict`.

    NOTE(review): the function builds `mars_dict` but never returns it and
    never closes the browser — it looks like a truncated draft; confirm
    against the original notebook/script.
    """
    mars_dict = {}

    # --- NASA Mars news, fetched with plain requests ---
    html = requests.get("https://mars.nasa.gov/news/")
    soup = BeautifulSoup(html.text, 'html.parser')
    mars_dict["news_title"] = soup.find('div', 'content_title').text
    mars_dict["news_p"] = soup.find('div', 'rollover_description_inner').text
    print("NEWS TITLE & DESCRIPTION")

    # --- JPL featured image, needs a live browser for the click ---
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_image)

    # Derive the scheme://host/ prefix for building the absolute image URL.
    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image))
    print(base_url)

    # XPath straight to the featured thumbnail; clicking opens the
    # full-resolution fancybox view.
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"
    browser.find_by_xpath(xpath)[0].click()

    soup = bs(browser.html, "html.parser")
    featured_image_url = base_url + soup.find("img", class_="fancybox-image")["src"]
    print(featured_image_url)
    mars_dict["featured_image_url"] = featured_image_url
    print("FEATURED IMAGE")

    # --- Latest Mars weather tweet ---
    browser.visit("https://twitter.com/marswxreport?lang=en")
    soup = bs(browser.html, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text
    print(mars_weather)
    mars_dict["mars_weather"] = mars_weather
def scrape():
    """Scrape NASA news, the JPL featured image and USGS hemisphere images.

    Returns a dict with the latest headline, teaser, featured image URL and
    a list of hemisphere {"title", "img_url"} dicts.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- Latest NASA Mars news ---
    Nasa_News_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(Nasa_News_url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # Index 1: index 0 is the nav-bar's content_title, not an article.
    latest_title = soup.find_all('div', class_='content_title')[1].text
    latest_teaser = soup.find_all('div', class_="article_teaser_body")[0].text

    # --- JPL featured image ---
    JPL_Mars_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(JPL_Mars_url)
    browser.click_link_by_id("full_image")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")
    soup = BeautifulSoup(browser.html, "html.parser")
    sub_img = soup.find("figure", class_="lede")
    featured_image = "https://www.jpl.nasa.gov" + sub_img.a["href"]

    # --- USGS hemisphere images ---
    USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(USGS_url)
    soup = BeautifulSoup(browser.html, "html.parser")
    hemi_list = []
    hemispheres = soup.find_all("div", class_="item")
    for hemi in hemispheres:
        hemi_div = hemi.find("div", class_="description")
        hemi_title = hemi_div.a.h3.text
        # NOTE(review): this clicks the first "Hemisphere Enhanced" link on
        # whatever page the browser currently shows, and never navigates
        # back — later iterations may click from a detail page. Confirm
        # against the live site before relying on all four results.
        browser.click_link_by_partial_text("Hemisphere Enhanced")
        time.sleep(3)
        soup_4 = BeautifulSoup(browser.html, "html.parser")
        usgs_src = soup_4.find("img", class_="wide-image")["src"]
        hemi_list.append({"title": hemi_title,
                          "img_url": "https://www.astrogeology.usgs.gov" + usgs_src})

    browser.quit()  # FIX: original leaked the chromedriver session

    # BUG FIX: the original dict literal listed `latest_teaser` and
    # `hemi_list` bare (no keys), which is a syntax error.
    mars_scrape_data = {
        'Latest Headline': latest_title,
        'Latest Teaser': latest_teaser,
        'Featured Image': featured_image,
        'Hemispheres': hemi_list,
    }
    return mars_scrape_data
# 3.个人信息 # 4.可用教室 # 5.选课 # 6.评教 targetpage = r"http://121.194.57.131/loginAction.do" #0 my_class_url = r'http://121.194.57.131/xskbAction.do?actionType=1' #1 my_grade_url = r'http://121.194.57.131/gradeLnAllAction.do?type=ln&oper=qb' #2 my_info_url = r'http://121.194.57.131/xjInfoAction.do?oper=xjxx' #3 ava_classroom_url = r'http://121.194.57.131/oneDayJasAction.do?oper=tjcx' #4 cho_class_url = r'http://121.194.57.131/xkMainAction.do?actionType=6' #5 list_wj_url = r'http://121.194.57.131/jxpgXsAction.do?oper=listWj' #6 cho_bxk_url = r'http://121.194.57.131/zytzAction.do?oper=bxqkc' #7 start = time.clock() browser = splinter.Browser() browser.visit(jw_log_url) if len(sys.argv) == 1: print u''' # 0.登陆\n # 1.课程表\n # 2.成绩\n # 3.个人信息\n # 4.可用教室\n # 5.选课\n # 6.评教\n ''' student_id = raw_input("ID: ") passWord = getpass.getpass(r"password(jwc.bjtu): ") choice = int(raw_input("choice: ")) elif len(sys.argv) == 2:
def scrape():
    """Scrape Mars news, the JPL featured image, facts tables and hemisphere
    images; persist the result to MongoDB and return it.

    Returns a dict with keys: article_title, news_list, featured_image,
    mars_table, hemisphere_dict_list.
    """
    dict_data = {}
    browser = init_browser()
    url = "https://mars.nasa.gov/news/"
    jpl_url = "https://www.jpl.nasa.gov/images?search=&category=Mars"
    mars_url = "https://space-facts.com/mars/"
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    # --- News: titles and teasers interleave in the matched divs ---
    browser.visit(url)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    titles_body_soup = soup.find_all(
        "div", class_=("content_title", "article_teaser_body"))
    title_list = []
    news_list = []
    for i, x in enumerate(titles_body_soup):
        if i % 2 == 0:
            news_list.append(x.text)
        else:
            title_list.append(x.text)
    # Drop the extra entry contributed by the nav bar's content_title div.
    news_list = news_list[1:49]

    # --- JPL featured image ---
    browser.visit(jpl_url)
    time.sleep(.5)
    browser.find_by_css("img.BaseImage").click()
    browser.find_by_css("svg.IconExpand").click()
    soup0 = BeautifulSoup(browser.html, "html.parser")
    featured_image_jpg = soup0.find_all(
        "div", class_="BaseLightbox__slide__img")[0]("img")[0]["src"]

    # --- Facts tables via pandas ---
    browser.visit(mars_url)
    mars_table = pd.read_html(mars_url)
    mars_facts = mars_table[0].rename(columns=({
        0: "Description",
        1: "Mars"
    })).set_index("Description")
    mars_html = mars_facts.to_html()

    # --- Hemisphere images ---
    hemisphere_dict_list = []
    browser.visit(hemi_url)
    time.sleep(.5)
    for x in range(4):
        browser.find_by_css("img.thumb")[x].click()
        browser.find_by_css("a.open-toggle").click()
        hemi_soup = BeautifulSoup(browser.html, "html.parser")
        # FIX: build a fresh dict each pass; reusing one dict object would
        # make every list entry alias the last hemisphere scraped.
        entry = {
            "title": hemi_soup("h2", class_="title")[0].text.replace(" Enhanced", ""),
            "img_url": hemi_soup("img", class_="wide-image")[0]["src"],
        }
        hemisphere_dict_list.append(entry)
        # Return to the results page for the next thumbnail.
        browser.visit(hemi_url)

    browser.quit()

    dict_data["article_title"] = title_list[0]
    dict_data["news_list"] = news_list[0]
    dict_data["featured_image"] = featured_image_jpg
    dict_data["mars_table"] = mars_html
    dict_data["hemisphere_dict_list"] = hemisphere_dict_list

    # Persist the scrape to the local MongoDB instance.
    from pymongo import MongoClient
    mongo_conn = MongoClient('mongodb://localhost:27017')
    mars_db = mongo_conn["mars_db"]
    mars_coll = mars_db["mars"]
    # BUG FIX: `mars_db.mars_coll.insert_one(...)` wrote to a collection
    # literally named "mars_coll" instead of the "mars" collection handle.
    mars_coll.insert_one(dict_data)

    return dict_data
def scrape():
    """Scrape NASA news, the JPL featured image, the latest "Sol" weather
    tweet, and the Mars facts table (written to mars_table.html).

    NOTE(review): this function never returns a value and never closes the
    browser — it appears to be a truncated draft of the fuller scrape()
    later in this file; confirm before relying on it.
    """
    # --- NASA Mars news via plain requests ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image: needs splinter to click through the overlay ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Docs recommend waiting between clicks; without this the second click
    # was flaky.
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    new_soup = bs(browser.html, 'html.parser')
    back_half_img_url = new_soup.find('img', class_='main_image').get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    # --- Mars weather: scan the ten most recent tweets for "Sol " ---
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')
    tweet_containers = twitter_soup.find_all('div', class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table via pandas ---
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]
    df.set_index(0, inplace=True)
    mars_data_df = df
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, leaving mars_data_html with its newlines.
    mars_data_html = mars_data_df.to_html().replace('\n', '')
    mars_data_df.to_html('mars_table.html')
def scrape_all():
    """Scrape NASA news, the JPL featured image, the Mars facts table and
    USGS hemisphere images into one dict.

    Returns a dict with keys: latest_news_title, latest_news_parag,
    JPL_featured_image, mars_facts_table, hemisphere_images.
    """
    browser = init_browser()

    # --- Latest NASA Mars news ---
    browser.visit('https://mars.nasa.gov/news/')
    news_soup = BeautifulSoup(browser.html, 'lxml')
    # Index 1: index 0 is the nav bar's content_title, not an article.
    title = news_soup.find_all('div', class_='content_title')
    news_title = title[1].text.strip()
    print(news_title)
    parag = news_soup.find_all('div', class_='article_teaser_body')
    # BUG FIX: the original assigned the whole ResultSet to news_p, storing
    # bs4 Tag objects in the result dict; take the first teaser's text,
    # mirroring how the title is handled above.
    news_p = parag[0].text.strip()
    print(news_p)

    # --- JPL featured image ---
    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    time.sleep(3)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    feat_soup = BeautifulSoup(browser.html, 'html.parser')
    mars_img_url = feat_soup.find('figure', class_='lede').a['href']
    featured_image_url = "https://www.jpl.nasa.gov" + mars_img_url
    print(f"{featured_image_url}")
    time.sleep(2)

    # --- Mars facts table via pandas ---
    mars_facts_url = 'https://space-facts.com/mars/'
    time.sleep(3)
    tables_found = pd.read_html(mars_facts_url)
    mars_facts_df = tables_found[0]
    mars_html_table = mars_facts_df.to_html()
    print(mars_html_table)

    # --- USGS hemisphere images ---
    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    hemis_soup = BeautifulSoup(browser.html, 'html.parser')
    hemis_orig_url = 'https://astrogeology.usgs.gov'
    hemisphere_urls = []
    for item in hemis_soup.find_all('div', class_='item'):
        hemi_title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        # Visit the detail page to pull the full-resolution image URL.
        browser.visit(hemis_orig_url + partial_img_url)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        img_url = hemis_orig_url + detail_soup.find('img', class_='wide-image')['src']
        hemisphere_urls.append({"title": hemi_title, "img_url": img_url})

    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls
    }

    browser.quit()
    return mars_dictionary
def scrape():
    """Scrape NASA news, the JPL featured image, the Mars weather tweet,
    the facts table (written to mars_table.html) and USGS hemisphere images.

    Returns a dict with keys: News_Title, Paragraph_Text,
    Most_Recent_Mars_Image, Mars_Weather, mars_h.
    """
    # --- NASA Mars news via plain requests ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image: splinter clicks through to the full image ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Docs recommend waiting between clicks; without this the second click
    # was flaky.
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    new_soup = bs(browser.html, 'html.parser')
    back_half_img_url = new_soup.find('img', class_='main_image').get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    # --- Mars weather: scan the ten most recent tweets for "Sol " ---
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')
    tweet_containers = twitter_soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = None  # FIX: original raised NameError when no tweet matched
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table via pandas ---
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]
    df.set_index(0, inplace=True)
    mars_data_df = df
    # FIX: str.replace returns a new string; the original discarded it.
    mars_data_html = mars_data_df.to_html().replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    # --- USGS hemisphere images via requests (no browser needed) ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="item product-item")
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        img_title = hemi_img.find('h3').text
        # Detail page holds the actual full-resolution download link.
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    browser.quit()  # FIX: original leaked the webdriver session

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }

    return mars_data
# Notebook-export fragment: filter the scraped tweets for a weather report,
# then build the Mars facts table.

current_tweet = soup.find_all('div', class_='js-tweet-text-container')

for tweet in current_tweet:
    mars_weather_tweet = tweet.find('p').text
    # BUG FIX: the original tested `'sol' and 'pressure' in ...`, which only
    # checks 'pressure' because the literal 'sol' is always truthy.
    if 'sol' in mars_weather_tweet and 'pressure' in mars_weather_tweet:
        print(mars_weather_tweet)
        print('-------------')

# Mars Facts
mars_facts_url = 'https://space-facts.com/mars/'
browser.visit(mars_facts_url)
time.sleep(1)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Second table on the page holds the description/value fact pairs.
mars_facts_second_table = pd.read_html(mars_facts_url)[1]
mars_facts_second_table = mars_facts_second_table.rename(index=str,
                                                         columns={
                                                             0: "Description",
                                                             1: "Value"
                                                         })
# BUG FIX: index='False' is a truthy *string*, so the index was still
# rendered; pass the boolean False to actually drop it.
mars_facts_second_tablehtml = mars_facts_second_table.to_html(index=False)
# Notebook-export fragment: pull the news title/paragraph from the soup
# built in an earlier cell, then start the JPL featured-image scrape.

title = soup.find("div", class_="content_title").text
paragraph_text = soup.find("div", class_="rollover_description_inner").text

print(paragraph_text)

# Visit the URL for JPL's Space Images and use splinter to navigate to the
# current featured image (assigned to featured_image_url; use .jpg).
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
featured_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# BUG FIX: the original called browser.visit(url), re-visiting the news URL
# left over from the earlier cell instead of the JPL page just assigned.
browser.visit(featured_image_url)

html = browser.html
soup = bs(html, "html.parser")

browser.click_link_by_partial_text('FULL IMAGE')
#time.sleep(5)

browser.click_link_by_partial_text('more info')
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere links.

    Returns a dict ready for Mongo storage. Requires network access,
    chromedriver, and the module-level imports (requests, bs, pd, Browser).
    """
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path)
    mars_scrape_data = {}

    # --- NASA Mars news: latest title and teaser paragraph ---
    url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description_inner').text
    mars_scrape_data['news_title'] = news_title
    mars_scrape_data['news_p'] = news_p

    # --- JPL featured image (carousel needs a real browser session) ---
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
    image = soup.find("article", class_="carousel_item")
    div = image.find("div", class_="default floating_text_area ms-layer")
    footer = div.find('footer')
    image = footer.a['data-fancybox-href']
    featured_image_url = "https://www.jpl.nasa.gov" + image
    mars_scrape_data['featured_image_url'] = featured_image_url

    # --- Latest Mars weather tweet ---
    twitter_url = 'https://twitter.com/marswxreport?lang=en'
    twitter_response = requests.get(twitter_url)
    twitter_soup = bs(twitter_response.text, 'lxml')
    twitter_result = twitter_soup.find('div', class_='js-tweet-text-container')
    mars_weather = twitter_result.find('p', class_='js-tweet-text').text
    mars_scrape_data['mars_weather'] = mars_weather

    # --- Mars facts table via pandas ---
    mars_facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
    df.columns = ['Description', 'Value']
    df.set_index('Description', inplace=True)
    mars_facts_table = df.to_html()
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, leaving the newlines in place.
    mars_facts_table = mars_facts_table.replace("\n", "")
    # Export scraped table into an html file as well.
    df.to_html('mars_facts_table.html')
    mars_scrape_data['mars_facts_table'] = mars_facts_table

    # --- USGS hemisphere titles and image page links ---
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    usgs_html = browser.html
    usgs_soup = bs(usgs_html, 'html.parser')
    usgs_images = usgs_soup.find_all('div', class_='description')
    # BUGFIX: mars_data was never initialised before .append(), which raised
    # NameError on the first loop iteration.
    mars_data = []
    for usgs_image in usgs_images:
        title = usgs_image.find('h3').text
        image_url = "https://astrogeology.usgs.gov" + usgs_image.a['href']
        mars_dict = {"title": title, "image_url": image_url}
        mars_data.append(mars_dict)
    mars_scrape_data["marsdata"] = mars_data

    return mars_scrape_data
# Persist the credentials blob to disk, validate it, then read it back.
# NOTE(review): `jsdata` and CheckDataAuthorization() are defined elsewhere in
# this file — presumably jsdata holds the JSON credential string; confirm.
temp = open('Authorization.json', 'w')
temp.write(jsdata)
temp.close()
CheckDataAuthorization()
temp = open("Authorization.json", 'r')
data = json.loads(temp.read())
#executable_path = {'executable_path': os.getcwd()+"/chromedriver"}
login = data['login']
password = data['password']
# NOTE(review): `browser.Browser` implies a module named `browser` is imported
# above this chunk (likely `from splinter import browser`) — verify.
browser = browser.Browser('chrome', incognito=True)
# Log in to the ordering site and navigate to the order / price-list pages.
browser.visit('https://zakaz.godovalov.ru/')
browser.reload()
browser.fill('login', login)
browser.fill('password', password)
browser.click_link_by_id('ext-gen1022')
browser.visit('https://zakaz.godovalov.ru/ordersale')
# Fixed wait for the order page to finish loading before requesting the JSON.
time.sleep(20)
browser.visit('https://zakaz.godovalov.ru/priceposlist_json?ordersale_id')
# NOTE(review): fragment of an `if len(sys.argv) == ...:` chain — the opening
# branch header was lost when this chunk was extracted, and the `"******` run
# below looks like secret-redaction damage to the original source; the tokens
# are preserved as found.
    student_id = raw_input("ID: ")
    passWord = getpass.getpass("password: "******"choice: "))
elif len(sys.argv) == 2:
    # One CLI argument: hard-coded account, choice taken from argv.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh',student_id)  # 'zjh' is the student-id form field
browser.fill('mm',passWord)     # 'mm' is the password form field

# CAPTCHA
v_yzm = ''
vrifycodeUrl = "http://121.194.57.131/validateCodeAction.do?"
# Extract the CAPTCHA text: fetch the image and save it locally for recognition.
while True:
    file = urllib2.urlopen(vrifycodeUrl)
    pic = file.read()
    picName = u'urf_login_temp.jpg'
    localpic = open(picName,"wb")
    localpic.write(pic)
    localpic.close()
# NOTE(review): fragment — the opening `if` branch header of this argv chain
# was lost when the chunk was extracted; the '******' strings look like
# secret-redaction placeholders. Tokens preserved as found.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Create a temp directory for CAPTCHA screenshots (ignore "already exists").
try:
    os.mkdir('temp')
except:
    pass

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh', student_id)
browser.fill('mm', passWord)

# Blank reference image used to recognise a failed/garbled CAPTCHA render.
error_image = Image.new("RGB", [100, 100], (255, 255, 255))
error_time = 0


def getCode(rand_code):
    # NOTE(review): body continues beyond this chunk — shown here truncated.
    global error_image
    global error_time
    # CAPTCHA text accumulator
    v_yzm = ''
    # List the files in the directory to get the screenshot file.
# NOTE(review): fragment — near-duplicate of the previous chunk; the opening
# `if` branch header of this argv chain was lost on extraction and the
# '******' strings look like secret-redaction placeholders.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Create a temp directory for CAPTCHA screenshots (ignore "already exists").
try:
    os.mkdir('temp')
except:
    pass

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh',student_id)
browser.fill('mm',passWord)

# Blank reference image used to recognise a failed/garbled CAPTCHA render.
error_image = Image.new("RGB", [100,100], (255,255,255))
error_time = 0


def getCode(rand_code):
    global error_image
    global error_time
    # CAPTCHA text accumulator
    v_yzm = ''
    # List the files in temp/ to locate the CAPTCHA screenshot.
    files = os.listdir('temp')
    picName = u'temp/'
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere images.

    Returns a dict of string/list results suitable for storage. Requires
    network access, chromedriver, and the module-level imports
    (init_browser, BeautifulSoup, pd, re, time).
    """
    browser = init_browser()

    # --- Mars News: latest title and teaser ---
    url_news = "https://mars.nasa.gov/news/"
    browser.visit(url_news)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # BUGFIX: store the text, not the bs4 Tag objects — the Tags are not
    # serialisable and every consumer wants the string anyway.
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    # print text to confirm
    print(news_title)
    print(news_p)

    # --- Featured Mars image ---
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_image)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    footer = soup.find("footer")
    link = footer.find('a')
    # BUGFIX: the original stored the relative href and only printed the full
    # url; keep the complete url so the returned dict is directly usable.
    featured_image_url = 'https://www.jpl.nasa.gov/' + link['data-fancybox-href']
    print(featured_image_url)

    # --- Mars weather: first span whose text matches 'sol' ---
    url_mars_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_mars_weather)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    sol_pattern = re.compile(r'sol')
    mars_weather = soup.find('span', text=sol_pattern).text
    print(mars_weather)

    # --- Mars facts ---
    url_facts = "https://space-facts.com/mars/"
    browser.visit(url_facts)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Raw text of the facts table, scraped by its tablepress id.
    mars_facts = soup.find('table', id="tablepress-p-mars-no-2").text
    print(mars_facts)
    # Same table via pandas, rendered to html (and saved to disk).
    table = pd.read_html(url_facts)
    df = table[0]
    html_table = df.to_html()
    df.to_html('table.html')

    # --- Mars hemispheres: title + full-resolution image per hemisphere ---
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemisphere)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='item')
    url_hemisphere_img = []
    main_url = 'https://astrogeology.usgs.gov'
    for item in items:
        title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        # Follow the item link and scrape the wide image from the detail page.
        browser.visit(main_url + partial_img_url)
        partial_img_html = browser.html
        soup = BeautifulSoup(partial_img_html, 'html.parser')
        img_url = main_url + soup.find('img', class_='wide-image')['src']
        url_hemisphere_img.append({"title": title, "img_url": img_url})

    # Close the browser after scraping.
    browser.quit()

    # Store data in a dictionary.
    mars_info = {
        "News Title": news_title,
        "News Paragraph": news_p,
        "Featured Image": featured_image_url,
        "Mars Weather": mars_weather,
        "Mars Facts": mars_facts,
        # BUGFIX: the original stored the raw read_html() list of DataFrames
        # here while the rendered html_table went unused; store the html.
        "Mars Table": html_table,
        "Mars Hemisphere": url_hemisphere_img,
    }
    # Return results
    return mars_info
# In[25]: def init_browser(): executable_path = {"executable_path": "chromedriver.exe"} return Browser("chrome", **executable_path, headless=False) # In[26]: # NASA Mars News browser = init_browser() # NASA Mars News Site url = "https://mars.nasa.gov/news/" browser.visit(url) time.sleep(3) # Scrape Page html = browser.html soup = bs(html, "html.parser") # News news = soup.find_all('div', class_="list_text")[0] # Title news_title = news.find(class_="content_title").text # News Article news_p = news.find(class_="article_teaser_body").text
# # Find the relative image url #img_url_rel = weather_soup.select_one('a', class_='inline_image_enlarge fancybox').get('src') #weather_img_url = f'https://www.jpl.nasa.gov{img_url_rel}' return weather_table.prettify() # Deliverable-1, Scrape High-Resolution Mars Hemisphere Images and Titles # Initiate headless driver for deployment (initialize the browser) browser = Browser("chrome", executable_path="chromedriver", headless=False) # 1. Use browser to visit the URL long_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(long_url) # Optional delay for loading the page browser.is_element_present_by_css("ul.item_list li.slide", wait_time=2) # main_url short_url = 'https://astrogeology.usgs.gov' # 2. Create a list to hold the images and titles. hemisphere_image_urls = [] # 3. Write code to retrieve the image urls and titles for each hemisphere. browser.visit(long_url) html = browser.html hemi_soup = soup(html, 'html.parser') main_url = hemi_soup.find_all('div', class_='item') titles = []
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere data.

    Returns a dict for downstream storage. Requires network access,
    chromedriver, and the module-level imports (req, bs, pd, Browser, time).
    """
    # --- NASA Mars News: collect title and teaser paragraph ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image (needs a real browser to click through) ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")
    # Auto-click through to the full image page.
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    # BUGFIX: the scraped image tag was ignored and a fixed URL was always
    # returned; build the URL from the page and keep the old constant only
    # as a fallback.
    if temp_img_url is not None and temp_img_url.get('src'):
        recent_mars_image_url = "https://www.jpl.nasa.gov" + temp_img_url['src']
    else:
        recent_mars_image_url = "https://www.imagecache.jpl.nasa.gov/images/640x350/PIA18605-16-640x350.jpg"

    # --- Mars weather from Twitter: first tweet mentioning "Sol " ---
    twitter_req = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_bs = bs(twitter_req.text, 'html.parser')
    tweet_output = twitter_bs.find_all('div', class_="js-tweet-text-container")
    # BUGFIX: the original indexed tweet_output[0..9] (IndexError when fewer
    # than 10 tweets) and left mars_weather unbound when nothing matched.
    mars_weather = None
    for tweet_container in tweet_output:
        tweets = tweet_container.text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table ---
    request_mars_facts = req.get("https://space-facts.com/mars/")
    mars_table = pd.read_html(request_mars_facts.text)
    mars_df = mars_table[0]
    # Index on the statistic names in column 0.
    mars_df.set_index(0, inplace=True)
    mars_df2 = mars_df
    # BUGFIX: str.replace returns a new string; the original discarded it.
    mars_data_html = mars_df2.to_html().replace('\n', '')
    mars_df2.to_html('mars_table.html')

    # --- Hemisphere titles and full-resolution image links ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemis_list = soup.find_all('a', class_="itemLink product-item")
    hemisphere_image_urls = []
    for hemi_img in hemis_list:
        img_title = hemi_img.find('h3').text
        # Visit each hemisphere's detail page to get the download link.
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    # BUGFIX: release the browser session instead of leaking it.
    browser.quit()

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns the mars_info dict for Mongo storage. Requires network access,
    chromedriver, and the module-level imports (init_browser, bs, cssutils,
    pd, time).
    """
    # Create Dictionary for Mongo
    mars_info = {}

    # --- Mars News ---
    browser = init_browser()
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Scrape the most recent article, then store its title and teaser.
    latest_article = soup.find("div", "list_text")
    news_title = latest_article.find("div", class_="content_title").text
    news_p = latest_article.find("div", class_="article_teaser_body").text
    mars_info["news_title"] = news_title
    mars_info["teaser"] = news_p

    # --- JPL featured image: pull the carousel's background-image style ---
    jpl_url = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    carousel = soup.find('div', class_='carousel_items')
    div_style = carousel.find('article')['style']
    style = cssutils.parseStyle(div_style)
    partial_url = style['background-image']
    # Cleaning up image url: strip the CSS url(...) wrapper to leave the path.
    partial_url = partial_url.replace('url(', '').replace(')', '')
    featured_image_url = "https://jpl.nasa.gov" + partial_url
    mars_info["featured_image_url"] = featured_image_url

    # --- Most Recent Tweet for Weather ---
    tweet_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(tweet_url)
    html = browser.html
    # CONSISTENCY: use the same `bs` alias as the rest of this function
    # (the original mixed bs(...) and BeautifulSoup(...)).
    soup = bs(html, 'html.parser')
    mars_weather = soup.find("p", class_="tweet-text").text
    print(mars_weather)
    mars_info["mars_weather"] = mars_weather

    # --- Mars facts table via pandas ---
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    facts = pd.read_html(facts_url)
    facts_df = pd.DataFrame(facts[0])
    facts_df.columns = ['Fact', 'Result']
    # DataFrame rendered to a single-line HTML string for templating.
    mars_table = facts_df.to_html(index=False, justify='left',
                                  classes='mars-table')
    mars_table = mars_table.replace('\n', ' ')
    mars_info["mars_table"] = mars_table

    # --- Hemisphere images: click each of the four thumbnails in turn ---
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    hemisphere_image_urls = []
    for i in range(4):
        time.sleep(5)  # allow the results page to re-render after .back()
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        partial_url = soup.find("img", class_="wide-image")["src"]
        image_title = soup.find("h2", class_="title").text
        image_url = 'https://astrogeology.usgs.gov' + partial_url
        image_dict = {"title": image_title, "image_url": image_url}
        hemisphere_image_urls.append(image_dict)
        browser.back()
    mars_info["hemispheres"] = hemisphere_image_urls

    # BUGFIX: the original ended with the bare attribute access `browser.quit`
    # (a no-op — the browser was never closed) and never returned the scraped
    # data; call quit() and return mars_info so callers receive the results.
    browser.quit()
    return mars_info