# Some of the elements have multiple lines (with email address # on the 2nd line. Take only the first line. s = el.text_content().split('\n')[0].strip() if not s: continue # On some occasions, the comma between the first and last name is # missing. Work around that by replacing the first space with the comma. if s.count(',') < 2: s = s.replace(' ', ', ', 1) last, first, party = s.split(',') # Clean up extra spaces first = first.strip() # Cut everything after the first space or period party = party.strip().split('.')[0].split(' ')[0].replace(':', '') # Canonize party abbreviation if party.lower() == 'peruss': party = 'PS' for p in PARTIES: if p.startswith(party): party = p break else: raise Exception("Unknown party: %s" % party) members.append(('%s %s' % (first, last), party)) submit_council_members("Helsinki", members)
# on the 2nd line. Take only the first line. s = el.text_content().split('\n')[0].strip() if not s: continue # On some occasions, the comma between the first and last name is # missing. Work around that by replacing the first space with the comma. if s.count(',') < 2: s = s.replace(' ', ', ', 1) last, first, party = s.split(',') # Clean up extra spaces first = first.strip() # Cut everything after the first space or period party = party.strip().split('.')[0].split(' ')[0].replace(':', '') # Canonize party abbreviation if party.lower() == 'peruss': party = 'PS' for p in PARTIES: if p.startswith(party): party = p break else: raise Exception("Unknown party: %s" % party) members.append(('%s %s' % (first, last), party)) submit_council_members("Helsinki", members)
else: name = el.tail name = name.strip() members.append((name, party)) return members requests_cache.configure('jyvaskyla') members = [] BASE_URL = 'http://www.jyvaskyla.fi/hallinto/valtuusto/valtuusto09' r = requests.get(BASE_URL) doc = html.fromstring(r.text) # We will be fetching linked pages, so relative paths must be # convert into absolute URLs. doc.make_links_absolute(BASE_URL) # Find the p element that contains the text "Valtuustoryhmät" el = doc.xpath(u"//h2[contains(., 'Valtuustoryhmät')]")[0] # The links to the council groups follow party_links = el.xpath("following-sibling::p/a") for link_el in party_links: url = link_el.attrib['href'] ret = scrape_council_group(url) members += ret # The city has exactly 75 council members assert len(members) == 75 submit_council_members("Jyväskylä", members)