コード例 #1
0
ファイル: pharmasouptical.py プロジェクト: stymy/pharmasoup
# Download and parse the ProPublica "Dollars for Docs" landing page.
mainpage = get_page('http://projects.propublica.org/docdollars/')
majorsoup = BeautifulSoup(mainpage)

# Every <td class="label"> without a style attribute is a candidate entry;
# the first and last matches are dropped before the text is collected.
label_cells = majorsoup.find_all("td", class_="label", style=False)
States = [td.string for td in label_cells[1:-1]]
# Graph that receives the triples added via g.add(...) further down.
g = Graph()

# One blank node per entity kind, created in the same order as before.
# NOTE(review): `state` is rebound by the `for state in States` loop that
# follows, so the blank node assigned here is discarded at that point.
state, clinic, company, transaction, payment = (
    BNode(),
    BNode(),
    BNode(),
    BNode(),
    BNode(),
)

for state in States:
    try:
        page = get_page('http://projects.propublica.org/docdollars/states/'+state.replace(' ','-'))
        soup = BeautifulSoup(page)
        state_txns = soup.find_all('tr') #find all transactions
    except urllib2.HTTPError:
        print state+' not available'
        continue

    def track(tag): #get payment information
        #clinic info
        clinic_name = tag[0].find(class_=False).string
        DDsite = tag[0].find(href=True)['href'] #Propublica's docdollars site
        city = tag[1].string.strip() #city
        state = soup.find("strong").string #state

        g.add( (clinic, RDF.type, FOAF.Organization) )
        g.add( (clinic, FOAF.name, Literal(clinic_name)) )