Example #1
from get_pages import Page
import json

# Scrape the index page for links to each state's page.
soup = Page().getSoup()
states = soup.select('ul.states a')
urls = {}

for state in states:
    # Fetch the state's page and collect the name and URL of every site on it.
    soup = Page(state['href']).getSoup()
    state_name = state.getText().strip()
    site_urls = {}
    sites = soup.select('div.content a')
    sites.pop(0)  # the first link in the content block is not a site, so drop it
    for site in sites:
        site_urls[site.getText().strip()] = site['href']
    urls[state_name] = site_urls

# Persist the scraped URLs for the loader script (Example #2) to consume.
with open('urls.json', 'w') as outfile:
    json.dump(urls, outfile)
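
Both examples rely on a Page helper imported from get_pages that is not shown here. The following is only a minimal sketch of what that module might look like, assuming it is a thin wrapper around requests and BeautifulSoup; BASE_URL is a hypothetical placeholder for whatever index page the no-argument call Page() fetches.

import requests
from bs4 import BeautifulSoup

# Hypothetical placeholder; the real module presumably points at the actual site.
BASE_URL = 'https://example.com/'

class Page:
    """Fetch a URL and expose its parsed HTML (a guess at get_pages.Page)."""

    def __init__(self, url=BASE_URL):
        self.url = url

    def getSoup(self):
        # Download the page and hand back a BeautifulSoup tree, matching
        # the Page(url).getSoup() calls in the examples above and below.
        response = requests.get(self.url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')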


Example #2
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from living_wage_database_setup import Base, Location, Wages, Salaries
from get_pages import Page
import json
import os

engine = create_engine(os.environ.get('DATABASE_URL'))
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()

def add(row):
    session.add(row)
    session.commit()

def addLocation(state, site, comma_index):
    # Strip anything after the first comma in the site name before storing it.
    if comma_index != -1:
        site = site[:comma_index]
    location = Location(state=state, site=site)
    add(location)
    return location.id

def dollarsToInt(cell):
    # Turn a table cell like "$1,234" into the integer 1234.
    return int(cell.getText().strip().replace('$', '').replace(',', ''))

# Load the URLs scraped by Example #1, which must run first.
with open('urls.json') as j:
    urls = json.load(j)

    for state in urls:
        for site, url in urls[state].items():
            loc_id = addLocation(state, site, site.find(','))
            soup = Page(url).getSoup()
            # The wage figures start at cell 105 of the expenses table;
            # the occupations table alternates occupation/salary cells.
            wages = [dollarsToInt(w) for w in soup.select('.expenses_table td')[105:]]
            salaries = [dollarsToInt(s) for s in soup.select('.occupations_table td')[1::2]]
            add(Wages(wages, loc_id))
            add(Salaries(salaries, loc_id))
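
Example #2 also imports Base, Location, Wages and Salaries from a living_wage_database_setup module that is not included. Below is a sketch of what those declarative models could look like, inferred only from how the loader calls them (Wages and Salaries take a list of integers plus a location id positionally, Location takes keyword arguments); every table and column name is a guess, and storing the lists as JSON-encoded strings is just one choice among several.

import json

from sqlalchemy import Column, ForeignKey, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Location(Base):
    __tablename__ = 'location'
    id = Column(Integer, primary_key=True)
    state = Column(String(50))
    site = Column(String(100))

class Wages(Base):
    __tablename__ = 'wages'
    id = Column(Integer, primary_key=True)
    location_id = Column(Integer, ForeignKey('location.id'))
    # The scraped figures, kept as one JSON-encoded string for simplicity;
    # the real module may well use a column per figure instead.
    values = Column(String)

    def __init__(self, values, location_id):
        self.values = json.dumps(list(values))
        self.location_id = location_id

class Salaries(Base):
    __tablename__ = 'salaries'
    id = Column(Integer, primary_key=True)
    location_id = Column(Integer, ForeignKey('location.id'))
    values = Column(String)

    def __init__(self, values, location_id):
        self.values = json.dumps(list(values))
        self.location_id = location_id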