# Scrape ProPublica's "Dollars for Docs" site and build an RDF graph of
# clinic/payment data.  Python 2 script (print statement, urllib2).
# Relies on names imported elsewhere in this file: get_page, BeautifulSoup,
# urllib2, and rdflib's Graph/BNode/Literal plus the RDF and FOAF namespaces.
# NOTE(review): this chunk arrived whitespace-mangled onto one line; the
# line breaks and indentation below are reconstructed from statement syntax.

# Fetch the landing page and collect state names from the "label" cells
# that carry no inline style; the [1:-1] slice drops the first and last
# matches (presumably header/footer cells — TODO confirm against the page).
mainpage = get_page('http://projects.propublica.org/docdollars/')
majorsoup = BeautifulSoup(mainpage)
States = []
for td in majorsoup.find_all("td",class_="label",style=False)[1:-1]:
    States.append(td.string)

# RDF graph and blank nodes shared by the whole script.
# NOTE(review): one BNode is created per role and then reused for every
# record, so every clinic's triples attach to the same node in the graph —
# verify this is intended (a fresh BNode per clinic seems more likely).
g = Graph()
state = BNode()  # NOTE(review): dead store — immediately shadowed by the loop variable below
clinic = BNode()
company = BNode()
transaction = BNode()
payment = BNode()

# Fetch each state's detail page; a state whose page raises an HTTP error
# is reported and skipped.  Each iteration rebinds the module-level
# `soup`/`state_txns`, which later code (including track below) reads.
for state in States:
    try:
        page = get_page('http://projects.propublica.org/docdollars/states/'+state.replace(' ','-'))
        soup = BeautifulSoup(page)
        state_txns = soup.find_all('tr') #find all transactions
    except urllib2.HTTPError:
        print state+' not available'
        continue

def track(tag):
    #get payment information
    # `tag` is presumably a sequence of cells from one row of state_txns,
    # indexed positionally (tag[0] = clinic cell, tag[1] = city) — TODO confirm
    # against the caller.  Reads the module-level `soup` (last page fetched
    # in the loop above) and mutates the module-level graph `g`.
    # NOTE(review): the body appears to continue past this chunk of the file.
    #clinic info
    clinic_name = tag[0].find(class_=False).string
    DDsite = tag[0].find(href=True)['href'] #Propublica's docdollars site
    city = tag[1].string.strip() #city
    state = soup.find("strong").string #state
    g.add( (clinic, RDF.type, FOAF.Organization) )
    g.add( (clinic, FOAF.name, Literal(clinic_name)) )