Code example #1
# Imports assumed by this snippet; Cache is the helper imported from
# advanced_expiry_caching in code example #2.
import requests
from bs4 import BeautifulSoup
from advanced_expiry_caching import Cache


def scrapeNPS():
    ##### CACHE
    FILENAME = "nps_cache.json"
    program_cache = Cache(FILENAME)

    url = "https://www.nps.gov" + "/index.htm"
    data = program_cache.get(url)

    if not data:
        data = requests.get(url).text
        program_cache.set(url, data, expire_in_days=1)

    soup = BeautifulSoup(data, "html.parser")

    ##### Get all state links
    state_lst = []
    for link in soup.find_all('a'):
        # some anchors have no href attribute, so use .get() to avoid a KeyError
        href = link.get('href', '')
        if '/state/' in href:
            state_lst.append(href)

    ##### Creating a new CSV called 'park_info'
    new_file = open('park_info.csv', 'w', encoding='utf8')
    new_file.write('name,type,location,description,state')
    new_file.write('\n')
    for states in state_lst:

        ##### Cache by states
        name = states.split("/")
        cache_each_state = "nps_cache_" + name[2] + ".json"
        program_cache = Cache(cache_each_state)
        url = "https://www.nps.gov" + states
        data = program_cache.get(url)

        if not data:
            data = requests.get(url).text
            program_cache.set(url, data, expire_in_days=1)
        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's name and all of its parks
        state_name = str(soup.find("h1", "page-title").string)
        park_list = soup.find_all('div', {'class': 'list_left'})  # avoid shadowing the built-in list

        for park in park_list:
            name = str(park.find('h3').string)
            park_type = str(park.find('h2').string)  # avoid shadowing the built-in type
            loc = str(park.find('h4').string)
            des = str(park.find('p').string)
            des = des.replace('\n', ' ')
            des = des.replace('"', "'")

            row_string = '"{}","{}","{}","{}","{}"'.format(
                name, park_type, loc, des, state_name)
            new_file.write(row_string)
            new_file.write('\n')

    new_file.close()

    ##### Save all states' info to a separate CSV
    new_state_file = open('states.csv', 'w', encoding='utf8')
    new_state_file.write('state,abbreviation,url')
    new_state_file.write('\n')

    for states in state_lst:

        ##### Derive the state abbreviation from the link path (/state/<abbr>/...)
        name = states.split("/")
        abbr = name[2].upper()
        url = "https://www.nps.gov" + states
        data = requests.get(url).text

        soup = BeautifulSoup(data, "html.parser")

        ##### Scrape the state's full name from the page title
        state = soup.find("h1", "page-title").string

        row_string = '"{}","{}","{}"'.format(state, abbr, url)
        new_state_file.write(row_string)
        new_state_file.write('\n')

    new_state_file.close()
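
Code example #1 (and the two snippets that follow) relies on a Cache class that is never shown; it is the helper imported from advanced_expiry_caching in code example #2. A minimal sketch of the interface these calls assume (a JSON-file-backed store with get(identifier) and set(identifier, data, expire_in_days=...)) could look like the following; the on-disk field names and the expiry handling here are assumptions, not the original implementation.

# Minimal sketch of the assumed Cache interface (not the original implementation).
import json
from datetime import datetime, timedelta


class Cache:
    def __init__(self, filename):
        self.filename = filename
        try:
            with open(self.filename, 'r', encoding='utf8') as f:
                self.diction = json.load(f)
        except (IOError, ValueError):
            self.diction = {}

    def get(self, identifier):
        entry = self.diction.get(identifier)
        if not entry:
            return None
        expires = entry.get("expires")
        # a missing expiry is treated as "never expires"
        if expires and datetime.utcnow() > datetime.fromisoformat(expires):
            return None
        return entry["data"]

    def set(self, identifier, data, expire_in_days=None):
        expires = None
        if expire_in_days is not None:
            expires = (datetime.utcnow() + timedelta(days=expire_in_days)).isoformat()
        self.diction[identifier] = {"data": data, "expires": expires}
        with open(self.filename, 'w', encoding='utf8') as f:
            json.dump(self.diction, f)

Storing an expiry timestamp next to the data is what lets get() return None for stale entries, so the scraping code can simply re-fetch whenever the cache comes back empty.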
Code example #2
from advanced_expiry_caching import Cache  # use tool from the other file for caching
import requests, os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sqlalchemy.orm import relationship
import time

########## Scraping data from the website: states, topics, activities
FILENAME = "allinfo_parks.json"  # saved in variable with convention of all-caps constant
program_cache = Cache(
    FILENAME)  # create a cache -- stored in a file of this name

url = "https://www.nps.gov/findapark/advanced-search.htm?p=1&v=0"  #url can act as identifier for caching in a scraping situation -- it IS frequently unique here, unlike in query requests

data = program_cache.get(url)
if not data:
    data = requests.get(url).text
    program_cache.set(url, data, expire_in_days=1)

soup = BeautifulSoup(data, "html.parser")  # "html.parser" selects Python's built-in HTML parser
states = soup.find_all(id="form-park")
activities = soup.find_all(id="form-activity")
topics = soup.find_all(id="form-topic")

states_name = []
for state in states:
    b = state.find_all('option')
    for i in range(len(b)):
        # assumed completion (the original snippet cuts off here): collect each option's text
        states_name.append(b[i].text)
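
For comparison, the same option text can be collected without the index loop by using a CSS selector on the form id the snippet already targets; this is an alternative sketch, not the original code.

# alternative to the loop above: select option tags under the #form-park element directly
states_name = [opt.text for opt in soup.select('#form-park option')]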
Code example #3
FILENAME = "dogs_cache.json"
program_cache = Cache(FILENAME)

url = "https://www.petwave.com/Dogs/Breeds.aspx"
data = requests.get(url).text
soup = BeautifulSoup(data, features="html.parser")
# print(soup.prettify()) # nice for investigation

all_urls = soup.find_all('div', attrs={'class': 'pw-rid-small-headline'})
for headline in all_urls:  # named headline to avoid shadowing the page url above
    links = headline.find_all('a')
    for a in links:
        new_url = "https://www.petwave.com" + a['href']
        # cache each breed page's HTML under its URL in the JSON cache file
        data = program_cache.get(new_url)
        if not data:
            data = requests.get(new_url).text
            program_cache.set(new_url, data)

try:
    with open(FILENAME, 'r') as cache_file:
        cache_diction = json.loads(cache_file.read())
except (IOError, ValueError):
    # fall back to an empty cache if the file is missing or holds invalid JSON
    cache_diction = {}

names_list = []
description_list = []
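
Code example #3 stops after loading the cache file and declaring the two result lists. A rough, assumed continuation is sketched below: it walks the cached breed URLs and parses each stored page; the <title> lookup is only a placeholder, because the real selectors for breed names and descriptions on petwave.com are not shown in the original.

# assumed continuation: iterate the cached breed pages and parse each one
for breed_url in cache_diction:
    page_html = program_cache.get(breed_url)  # returns the stored HTML text
    if not page_html:
        continue
    page_soup = BeautifulSoup(page_html, "html.parser")
    # placeholder extraction -- the real selectors for breed names and
    # descriptions are not shown in the original snippet
    title_tag = page_soup.find('title')
    if title_tag and title_tag.string:
        names_list.append(title_tag.string.strip())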