def scrape(self):
    """Scrape the county board page and yield one Legislator per member.

    Fetches ``self.COUNTY_BOARD_URL``, walks every member row, and reads
    the member's name, photo URL, position (the text between the first
    and second ``<br>`` of the info block), e-mail address and, when a
    "bio" link is present, the plain-text biography from the linked page.

    Yields:
        Legislator: one per member row, with the board URL as a source.

    Raises:
        IndexError: if an expected element (name, picture, info block,
            position line, or biography container) is missing.
    """
    board_html = self.urlopen(self.COUNTY_BOARD_URL)
    board_lxml = lxml.html.fromstring(board_html)
    # Image and bio hrefs on the page may be relative; resolve them up front.
    board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL)

    for board_member_lxml in board_lxml.cssselect(
            "div[name=cbo_list] div[name=row]"):
        name = board_member_lxml.cssselect(
            "div[name=info] strong")[0].text.strip()
        image = board_member_lxml.cssselect(
            "div[name=pictures] img")[0].get('src')

        info_html = lxml.html.tostring(
            board_member_lxml.cssselect("div[name=info]")[0]).decode()
        # flags must be passed by keyword: the third positional argument
        # of re.split is maxsplit, not flags.
        pieces = re.split(r'<br\s*\/?>', info_html, flags=re.I)
        position = re.sub(r'<[^>]*>', '', pieces[1]).strip()

        email = bio_link = None
        for link in board_member_lxml.cssselect("div[name=info] a"):
            if link.text is None:
                continue
            link_text = link.text.lower()
            if 'arlingtonva.us' in link_text:
                # Addresses are obfuscated on the page as "user (at) host".
                email = re.sub(r'\s*\(at\)\s*', '@', link.text).strip()
            elif 'bio' in link_text:
                bio_link = link

        legislator = Legislator(name=name, district=position, image=image)
        legislator.add_contact(type='email',
                               value=email,
                               note='%(name)s email address' % {'name': name})
        legislator.add_source(self.COUNTY_BOARD_URL)

        if bio_link is not None:
            bio_href = bio_link.attrib.get('href')
            bio_html = self.urlopen(bio_href)
            bio_lxml = lxml.html.fromstring(bio_html)
            bio_markup = lxml.html.tostring(
                bio_lxml.cssselect('#textSection #text')[0]).decode()
            # Strip every tag. The original passed re.I positionally, which
            # re.sub treats as count=2 and so removed only the first two tags.
            bio_text = re.sub(r'<[^>]*>', '', bio_markup).strip()
            # Normalise non-breaking spaces, however the serializer spelled
            # them (named entity, numeric reference, or raw U+00A0).
            bio_text = re.sub(r'&nbsp;|&#160;|\xa0', ' ', bio_text)
            legislator.biography = bio_text
            legislator.add_link('bio page', bio_href)

        yield legislator
def get_people(self):
    """Scrape the county board page and yield one Legislator per member.

    Fetches ``self.COUNTY_BOARD_URL``, walks every member row, and reads
    the member's name, photo URL, position (the text between the first
    and second ``<br>`` of the info block), e-mail address and, when a
    "bio" link is present, the plain-text biography from the linked page.

    Yields:
        Legislator: one per member row, with the board URL as a source.

    Raises:
        IndexError: if an expected element (name, picture, info block,
            position line, or biography container) is missing.
    """
    board_html = self.urlopen(self.COUNTY_BOARD_URL)
    board_lxml = lxml.html.fromstring(board_html)
    # Image and bio hrefs on the page may be relative; resolve them up front.
    board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL)

    for board_member_lxml in board_lxml.cssselect(
            "div[name=cbo_list] div[name=row]"):
        name = board_member_lxml.cssselect(
            "div[name=info] strong")[0].text.strip()
        image = board_member_lxml.cssselect(
            "div[name=pictures] img")[0].get('src')

        # tostring() returns bytes by default; decode so the str regexes
        # below can be applied (a str pattern on bytes raises TypeError
        # on Python 3).
        info_html = lxml.html.tostring(
            board_member_lxml.cssselect("div[name=info]")[0]).decode()
        # flags must be passed by keyword: the third positional argument
        # of re.split is maxsplit, not flags.
        pieces = re.split(r'<br\s*\/?>', info_html, flags=re.I)
        position = re.sub(r'<[^>]*>', '', pieces[1]).strip()

        email = bio_link = None
        for link in board_member_lxml.cssselect("div[name=info] a"):
            if link.text is None:
                continue
            link_text = link.text.lower()
            if 'arlingtonva.us' in link_text:
                # Addresses are obfuscated on the page as "user (at) host".
                email = re.sub(r'\s*\(at\)\s*', '@', link.text).strip()
            elif 'bio' in link_text:
                bio_link = link

        legislator = Legislator(name=name, post_id=position, image=image)
        legislator.add_contact(type='email',
                               value=email,
                               note='%(name)s email address' % {'name': name})
        legislator.add_source(self.COUNTY_BOARD_URL)

        if bio_link is not None:
            bio_href = bio_link.attrib.get('href')
            bio_html = self.urlopen(bio_href)
            bio_lxml = lxml.html.fromstring(bio_html)
            bio_markup = lxml.html.tostring(
                bio_lxml.cssselect('#textSection #text')[0]).decode()
            # Strip every tag. The original passed re.I positionally, which
            # re.sub treats as count=2 and so removed only the first two tags.
            bio_text = re.sub(r'<[^>]*>', '', bio_markup).strip()
            # Normalise non-breaking spaces, however the serializer spelled
            # them (named entity, numeric reference, or raw U+00A0).
            bio_text = re.sub(r'&nbsp;|&#160;|\xa0', ' ', bio_text)
            legislator.biography = bio_text
            legislator.add_link('bio page', bio_href)

        yield legislator
def get_people(self):
    """Yield a committee, its subcommittee, and the committee's chairman.

    Yields, in order: the 'Technology' committee (with a 'Chairman'
    post), the 'Subcommittee on E-Commerce' whose parent is that
    committee, and a Legislator holding the chairman role on the
    committee. Every object is sourced to the same placeholder URL.
    """
    source_url = 'https://example.com'

    # committee
    technology = Organization('Technology', classification='committee')
    technology.add_post('Chairman', 'chairman')
    technology.add_source(source_url)
    yield technology

    # subcommittee
    e_commerce = Organization('Subcommittee on E-Commerce',
                              parent=technology,
                              classification='committee')
    e_commerce.add_source(source_url)
    yield e_commerce

    chairman = Legislator('Paul Tagliamonte', '6')
    chairman.add_membership(technology, role='chairman')
    chairman.add_source(source_url)
    yield chairman
def people_to_pupa(stream, transaction):
    """Convert rows of people data into pupa objects.

    Args:
        stream: iterable of mapping-like rows. Each row must carry
            non-blank "Name" and "District" values and may carry up to
            three each of "Email N", "Address N" and "Phone N".
        transaction: object whose ``jurisdiction`` exposes ``name`` and
            ``id``.

    Yields:
        For every row, the Legislator followed by each entry of its
        ``_related``; after the last row, each entry of the legislature
        organization's ``_related`` and finally the organization itself.

    Raises:
        ValueError: if a row has a blank name or a blank district.
    """
    # pupa contact type -> the row columns that feed it, in output order.
    contact_columns = (
        ("email", ("Email 1", "Email 2", "Email 3")),
        ("address", ("Address 1", "Address 2", "Address 3")),
        ("voice", ("Phone 1", "Phone 2", "Phone 3")),
    )

    legislature = Organization(
        name=transaction.jurisdiction.name,
        classification='legislature',
    )

    for row in stream:
        # XXX: Validate the row better.
        person_name = row.get("Name", "").strip()
        district_label = row.get("District", "").strip()

        if not person_name:
            raise ValueError("A name is required for each entry.")
        if not district_label:
            raise ValueError("A district is required for each entry.")

        person = Legislator(name=person_name, district=district_label)
        legislature.add_post(label=district_label, role="member")

        for detail_type, columns in contact_columns:
            for column in columns:
                detail = row.get(column)
                if not detail:
                    continue
                # The column header doubles as the contact detail's note.
                person.add_contact_detail(
                    type=detail_type, value=detail, note=column)

        person.add_source(url=OCD_SOURCE_URL)
        person.validate()
        person.pre_save(transaction.jurisdiction.id)

        yield person
        for dependent in person._related:
            yield dependent

    for dependent in legislature._related:
        yield dependent
    yield legislature
def get_people(self):
    """Yield a Legislator for each entry of ``self.councilMembers()``.

    Each entry is a ``(councilman, committees)`` pair. The member's
    office, phone, fax and e-mail fields become contact details, the
    website (when present) becomes a 'homepage' link, and every
    committee row whose legislative body is neither 'City Council' nor
    'Office of the Mayor' becomes a committee membership.

    Yields:
        Legislator: one per council member, sourced to ``MEMBERLIST``.
    """
    # (member field, contact type) pairs; the field name doubles as the
    # note. Built once rather than on every iteration.
    contact_fields = (
        ("City Hall Office", "address"),
        ("City Hall Phone", "phone"),
        ("Ward Office Phone", "phone"),
        ("Ward Office Address", "address"),
        ("Fax", "fax"),
    )
    # Legislative bodies that are skipped rather than recorded as
    # committee memberships.
    skipped_bodies = ('City Council', 'Office of the Mayor')

    for councilman, committees in self.councilMembers():
        contacts = [
            {"type": kind, "value": councilman[field], "note": field}
            for field, kind in contact_fields
            if councilman[field]
        ]

        if councilman["E-mail"]:
            contacts.append({
                "type": "email",
                "value": councilman['E-mail']['label'],
                'note': 'E-mail',
            })

        p = Legislator(councilman['Person Name']['label'],
                       post_id="Ward %s" % councilman['Ward/Office'],
                       image=councilman['Photo'],
                       contact_details=contacts)

        if councilman['Website']:
            p.add_link('homepage', councilman['Website']['url'])
        p.add_source(MEMBERLIST)

        for committee, _, _ in committees:
            body = committee['Legislative Body']['label']
            # Rows without a body label are ignored (the leftover debug
            # print that used to fire here has been dropped).
            if body and body not in skipped_bodies:
                p.add_committee_membership(body, role=committee["Title"])

        yield p
def people_to_pupa(stream, transaction):
    """Turn rows of people data into pupa objects, yielded one at a time.

    Args:
        stream: iterable of mapping-like rows with "Name" and "District"
            entries plus optional "Email 1-3", "Address 1-3" and
            "Phone 1-3" entries.
        transaction: object providing ``jurisdiction.name`` and
            ``jurisdiction.id``.

    Yields:
        Each row's Legislator and then the items of its ``_related``;
        once all rows are consumed, the items of the legislature
        organization's ``_related`` followed by the organization.

    Raises:
        ValueError: when a row's name or district is blank.
    """
    # Flatten (contact type, column) pairs in the order they are applied.
    contact_slots = [
        (kind, column)
        for kind, columns in (
            ("email", ("Email 1", "Email 2", "Email 3")),
            ("address", ("Address 1", "Address 2", "Address 3")),
            ("voice", ("Phone 1", "Phone 2", "Phone 3")),
        )
        for column in columns
    ]

    body = Organization(name=transaction.jurisdiction.name,
                        classification="legislature")

    for record in stream:
        # XXX: Validate the row better.
        name = record.get("Name", "").strip()
        district = record.get("District", "").strip()

        if not name:
            raise ValueError("A name is required for each entry.")
        if not district:
            raise ValueError("A district is required for each entry.")

        member = Legislator(name=name, district=district)
        body.add_post(label=district, role="member")

        for kind, column in contact_slots:
            cell = record.get(column)
            if cell:
                # The column header is stored as the detail's note.
                member.add_contact_detail(type=kind, value=cell, note=column)

        member.add_source(url=OCD_SOURCE_URL)
        member.validate()
        member.pre_save(transaction.jurisdiction.id)

        yield member
        for extra in member._related:
            yield extra

    for extra in body._related:
        yield extra
    yield body
def get_people(self):
    """Yield a Legislator for each entry of ``self.councilMembers()``.

    Each entry is a ``(councilman, committees)`` pair. The member's
    office, phone, fax and e-mail fields become contact details, the
    website (when present) becomes a 'homepage' link, and every
    committee row whose legislative body is neither 'City Council' nor
    'Office of the Mayor' becomes a committee membership.

    Yields:
        Legislator: one per council member, sourced to ``MEMBERLIST``.
    """
    # (member field, contact type) pairs; the field name doubles as the
    # note. Built once rather than on every iteration.
    contact_fields = (
        ("City Hall Office", "address"),
        ("City Hall Phone", "phone"),
        ("Ward Office Phone", "phone"),
        ("Ward Office Address", "address"),
        ("Fax", "fax"),
    )
    # Legislative bodies that are skipped rather than recorded as
    # committee memberships.
    skipped_bodies = ('City Council', 'Office of the Mayor')

    for councilman, committees in self.councilMembers():
        contacts = [
            {"type": kind, "value": councilman[field], "note": field}
            for field, kind in contact_fields
            if councilman[field]
        ]

        if councilman["E-mail"]:
            contacts.append({
                "type": "email",
                "value": councilman['E-mail']['label'],
                'note': 'E-mail',
            })

        p = Legislator(councilman['Person Name']['label'],
                       post_id="Ward %s" % councilman['Ward/Office'],
                       image=councilman['Photo'],
                       contact_details=contacts)

        if councilman['Website']:
            p.add_link('homepage', councilman['Website']['url'])
        p.add_source(MEMBERLIST)

        for committee, _, _ in committees:
            body = committee['Legislative Body']['label']
            # Rows without a body label are ignored (the leftover debug
            # print that used to fire here has been dropped).
            if body and body not in skipped_bodies:
                p.add_committee_membership(body, role=committee["Title"])

        yield p