def scrapeNPS():
    ##### CACHE
    FILENAME = "nps_cache.json"
    program_cache = Cache(FILENAME)

    url = "https://www.nps.gov" + "/index.htm"
    data = program_cache.get(url)
    if not data:
        data = requests.get(url).text
        program_cache.set(url, data, expire_in_days=1)
    soup = BeautifulSoup(data, "html.parser")

    ##### Get all state links
    state_lst = []
    for link in soup.find_all('a'):
        if '/state/' in link.get('href', ''):  # .get() skips anchors without an href
            # print(link['href'])
            state_lst.append(link['href'])

    ##### Create a new CSV called 'park_info'
    new_file = open('park_info.csv', 'w', encoding='utf8')
    new_file.write('name,type,location,description,state')
    new_file.write('\n')

    for states in state_lst:
        ##### Cache each state's page in its own file
        name = states.split("/")
        cache_each_state = "nps_cache_" + name[2] + ".json"
        program_cache = Cache(cache_each_state)
        url = "https://www.nps.gov" + states
        data = program_cache.get(url)
        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=1)
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's name and all of its parks
        state = soup.find("h1", "page-title")
        state_name = state.string if state else ""
        park_list = soup.find_all('div', {'class': 'list_left'})  # avoid shadowing the built-in `list`
        for park in park_list:
            park_name = str(park.find('h3').string)
            park_type = str(park.find('h2').string)  # avoid shadowing the built-in `type`
            loc = str(park.find('h4').string)
            des = str(park.find('p').string)
            des = des.replace('\n', ' ')
            des = des.replace('"', "'")
            row_string = '"{}","{}","{}","{}","{}"'.format(
                park_name, park_type, loc, des, state_name)
            new_file.write(row_string)
            new_file.write('\n')
    new_file.close()

    ##### Save all states' info as a CSV
    new_state_file = open('states.csv', 'w', encoding='utf8')
    new_state_file.write('state,abbreviation,url')
    new_state_file.write('\n')
    for states in state_lst:
        ##### Get the state's abbreviation from its URL
        name = states.split("/")
        abbr = name[2].upper()
        url = "https://www.nps.gov" + states
        data = requests.get(url).text
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's name and write one row per state
        state = soup.find("h1", "page-title")
        state_name = state.string if state else ""
        row_string = '"{}","{}","{}"'.format(state_name, abbr, url)
        new_state_file.write(row_string)
        new_state_file.write('\n')
    new_state_file.close()
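##### Usage sketch (not in the original file): with the imports below in scope
##### (Cache, requests, BeautifulSoup), calling scrapeNPS() writes park_info.csv
##### (one row per park) and states.csv (one row per state). A call like the
##### following would normally sit at the end of the script, after the imports:
# if __name__ == "__main__":
#     scrapeNPS()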
from advanced_expiry_caching import Cache  # use the tool from the other file for caching
import requests, os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sqlalchemy.orm import relationship
import time

########## Scraping data from the website: states, topics, activities
FILENAME = "allinfo_parks.json"  # saved in a variable, following the all-caps constant convention
program_cache = Cache(FILENAME)  # create a cache -- stored in a file of this name

url = "https://www.nps.gov/findapark/advanced-search.htm?p=1&v=0"
# The URL can act as the identifier for caching in a scraping situation --
# it IS frequently unique here, unlike in query requests.
data = program_cache.get(url)
if not data:
    data = requests.get(url).text
    program_cache.set(url, data, expire_in_days=1)

soup = BeautifulSoup(data, "html.parser")  # the "html.parser" argument tells BeautifulSoup to parse the text as HTML

states = soup.find_all(id="form-park")
activities = soup.find_all(id="form-activity")
topics = soup.find_all(id="form-topic")

states_name = []
for state in states:
    b = state.find_all('option')
    for i in range(len(b)):
        # The original excerpt is cut off here; collecting each <option>'s text
        # into states_name is the assumed continuation.
        states_name.append(str(b[i].string))
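##### Illustrative sketch (not part of the project code): the Cache imported from
##### advanced_expiry_caching is assumed to expose roughly this interface --
##### get(identifier) returns cached data or None, and set(identifier, data,
##### expire_in_days=...) stores it in a JSON file keyed by identifier. The name
##### _CacheSketch and the exact storage layout are assumptions for illustration,
##### not the real module.
import json
import os
from datetime import datetime, timedelta

class _CacheSketch:
    """Assumed shape of advanced_expiry_caching.Cache; for illustration only."""

    def __init__(self, filename):
        self.filename = filename
        if os.path.isfile(filename):
            with open(filename, 'r', encoding='utf8') as f:
                self.diction = json.load(f)
        else:
            self.diction = {}

    def get(self, identifier):
        record = self.diction.get(identifier)
        if not record:
            return None
        if datetime.now().isoformat() > record["expires"]:
            return None  # entry has expired
        return record["data"]

    def set(self, identifier, data, expire_in_days=7):
        expires = (datetime.now() + timedelta(days=expire_in_days)).isoformat()
        self.diction[identifier] = {"data": data, "expires": expires}
        with open(self.filename, 'w', encoding='utf8') as f:
            json.dump(self.diction, f)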
FILENAME = "dogs_cache.json" program_cache = Cache(FILENAME) url = "https://www.petwave.com/Dogs/Breeds.aspx" data = requests.get(url).text soup = BeautifulSoup(data, features="html.parser") # print(soup.prettify()) # nice for investigation all_urls = soup.findAll('div', attrs={'class': 'pw-rid-small-headline'}) for url in all_urls: links = url.findAll('a') for a in links: new_url = "https://www.petwave.com" + a['href'] #cache all the urls into a json file data = program_cache.get(new_url) # print(new_url) if not data: data = requests.get(new_url).text program_cache.set(new_url, data) try: cache_file = open(FILENAME, 'r') cache_contents = cache_file.read() cache_diction = json.loads(cache_contents) cache_file.close() except: cache_diction = {} names_list = [] description_list = []