def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative list entry.

    ``item`` must expose ``.xpath()``; the photo URL, profile link, name
    heading and contact lines are all read from it.

    The voice and e-mail contact details are attached only when the scraped
    text passes ``validate_phone_number`` / ``validate_email_address``.

    Raises:
        IndexError: if an expected node or contact line is missing.
        AttributeError: if the heading does not match the expected pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    # Heading shape: "<name> (<2 digits><A|B>, <PARTY>)".
    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party = PARTIES[name_match.group(3)]

    info_texts = [
        x.strip()
        for x in item.xpath('./div/text()[normalize-space()]')
        if x.strip()
    ]
    # The first two non-blank text lines form the address, the third the phone.
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Bind explicitly to None on failed validation; leaving the name unbound
    # would raise UnboundLocalError further down.
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org='lower',
        role='Representative',
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email is not None:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    """Yield a ``Person`` built from one table row of the member list.

    ``item`` must expose ``.xpath()``. The first cell supplies the photo;
    the second cell's ``<p>`` supplies the links (profile, then e-mail) and
    the text lines (address, address, phone).

    The voice and e-mail contact details are attached only when the scraped
    text passes ``validate_phone_number`` / ``validate_email_address``.

    Raises:
        IndexError: if an expected node or contact line is missing.
        AttributeError: if the heading does not match the expected pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    profile_node = info_nodes[0]
    name_text = profile_node.xpath('./b/text()')[0]
    url = profile_node.get('href')

    # Heading shape: "<name> (<2 digits><A|B>, <PARTY>)".
    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party = PARTIES[name_match.group(3)]

    info_texts = [
        x.strip()
        for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        )
        if x.strip()
    ]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Bind explicitly to None on failed validation; leaving the name unbound
    # would raise UnboundLocalError further down.
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = info_nodes[1].text.replace('Email: ', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org='lower',
        role='Representative',
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email is not None:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative list entry.

    ``item`` must expose ``.xpath()``; the photo URL, profile link, name
    heading and contact lines are all read from it.

    The voice and e-mail contact details are attached only when the scraped
    text passes ``validate_phone_number`` / ``validate_email_address``.

    Raises:
        IndexError: if an expected node or contact line is missing.
        AttributeError: if the heading does not match the expected pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    # Heading shape: "<name> (<2 digits><A|B>, <PARTY>)".
    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party = PARTIES[name_match.group(3)]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    # The first two non-blank text lines form the address, the third the phone.
    address = "\n".join((info_texts[0], info_texts[1]))

    # Bind explicitly to None on failed validation; leaving the name unbound
    # would raise UnboundLocalError further down.
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone is not None:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email is not None:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    """Yield a ``Person`` built from one table row of the member list.

    ``item`` must expose ``.xpath()``. The first cell supplies the photo;
    the second cell's ``<p>`` supplies the links (profile, then e-mail) and
    the text lines (address, address, phone). Contact details are added
    without a ``note``.

    The voice and e-mail contact details are attached only when the scraped
    text passes ``validate_phone_number`` / ``validate_email_address``.

    Raises:
        IndexError: if an expected node or contact line is missing.
        AttributeError: if the heading does not match the expected pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    profile_node = info_nodes[0]
    name_text = profile_node.xpath('./b/text()')[0]
    url = profile_node.get('href')

    # Heading shape: "<name> (<2 digits><A|B>, <PARTY>)".
    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party = PARTIES[name_match.group(3)]

    info_texts = [
        x.strip()
        for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        )
        if x.strip()
    ]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Bind explicitly to None on failed validation; leaving the name unbound
    # would raise UnboundLocalError further down.
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = info_nodes[1].text.replace('Email: ', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org='lower',
        role='Representative',
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address)
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone)
    if email is not None:
        rep.add_contact_detail(type='email', value=email)
    rep.add_source(self.url)

    yield rep
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Args:
        term: passed through unchanged as the ``term`` of every legislator.

    The phone number and e-mail address are passed on only when they pass
    ``validate_phone_number`` / ``validate_email_address``; otherwise the
    corresponding keyword is simply omitted.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    for legislator_node in legislator_nodes:
        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')
        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

        # The first link's bold text carries name, district and party,
        # e.g. everything up to "(" is the name.
        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0).replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district = (
            district_match.group(0).replace('(', '').lstrip('0').strip())

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0).replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]')
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Bind explicitly to None on failed validation; leaving the name
        # unbound would raise UnboundLocalError further down.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = info_nodes[1].text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        legislator_kwargs = {
            'term': term,
            'chamber': 'lower',
            'district': district,
            'full_name': name,
            'party': party,
            'photo_url': photo_url,
        }
        office_kwargs = {
            'type': 'capitol',
            'name': "Capitol Office",
            'address': address,
        }
        # Only forward contact values that actually validated.
        if phone is not None:
            office_kwargs['phone'] = phone
        if email is not None:
            legislator_kwargs['email'] = email
            office_kwargs['email'] = email

        legislator = Legislator(**legislator_kwargs)
        legislator.add_source(url)
        legislator.add_office(**office_kwargs)

        self.save_legislator(legislator)
def scrape_lower_chamber(self, term):
    """Yield a ``Person`` for every representative on the chamber roster.

    Each member's own page is fetched to read party, address, phone and fax.
    ``term`` is accepted but not used by this method.

    Address, voice and fax contact details are added only when a value was
    found (and, for voice/fax, passed ``validate_phone_number``).
    """
    # E-mail contact is now hidden behind webforms. Sadness.
    party_map = {
        'PNP': 'Partido Nuevo Progresista',
        'PPD': u'Partido Popular Democr\xe1tico',
        'PIP': u'Partido Independentista Puertorrique\u00F1o',
    }

    url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx'
    page = self.lxmlize(url)

    member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
    for member_node in member_nodes:
        member_info = member_node.text_content().strip().split("\n")

        name = re.sub(r'^Hon\.', '', member_info[0]).strip()
        district_text = member_info[-1].strip()
        if district_text == 'Representante por Acumulación':
            district = 'At-Large'
        else:
            district = district_text.replace(
                "Representante del Distrito ", "").strip()
        photo_url = self.get_node(member_node, './/img/@src')

        rep_link = self.get_node(member_node, ".//a/@href")
        rep_page = self.lxmlize(rep_link)

        party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
        # NOTE: compare against None rather than using truthiness; an lxml
        # element with no child elements is falsy even when it exists.
        # (Assumes get_node returns None when nothing matches; confirm.)
        if party_node is None:
            party_text = ''
        else:
            party_text = party_node.text_content().strip()
        # Albelo doesn't seem to have a "partyBio" as an independent, but we
        # expect this to exist for all other members.
        if not party_text and name == "Manuel A. Natal Albelo":
            party = "Independent"
        else:
            party = party_map[party_text]

        address = self.get_node(
            rep_page, '//h6').text.strip().split("\n")[0].strip()

        # Only grabs the first validated phone number found.
        # Typically, representatives have multiple phone numbers.
        phone_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Tel.")]')
        phone = None
        if phone_node is not None and phone_node.text:
            for phone_attempt in phone_node.text.strip().split("\n"):
                phone_text = re.sub(
                    r'^Tel\.[\s]*', '', phone_attempt).strip()
                if validate_phone_number(phone_text):
                    # Don't keep searching once a good number is found.
                    phone = phone_text
                    break

        fax_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Fax.")]')
        fax = None
        # `if fax_node:` would skip a text-only <span> (no children).
        if fax_node is not None and fax_node.text:
            fax_text = re.sub(
                r'^Fax\.[\s]*', '', fax_node.text.strip()).strip()
            if validate_phone_number(fax_text):
                fax = fax_text

        person = Person(
            primary_org='lower',
            district=district,
            name=name,
            party=party,
            image=photo_url,
        )

        person.add_link(rep_link)
        person.add_source(rep_link)
        person.add_source(url)

        if address:
            person.add_contact_detail(
                type='address', value=address, note='Capitol Office')
        if phone:
            person.add_contact_detail(
                type='voice', value=phone, note='Capitol Office')
        if fax:
            person.add_contact_detail(
                type='fax', value=fax, note='Capitol Office')

        yield person
def scrape_lower_chamber(self, term):
    """Yield a ``Person`` for every representative on the chamber roster.

    Each member's own page is fetched to read party, address, phone and fax.
    ``term`` is accepted but not used by this method.

    Address, voice and fax contact details are added only when a value was
    found (and, for voice/fax, passed ``validate_phone_number``).
    """
    # E-mail contact is now hidden behind webforms. Sadness.
    party_map = {
        "PNP": "Partido Nuevo Progresista",
        "PPD": u"Partido Popular Democr\xe1tico",
        "PIP": u"Partido Independentista Puertorrique\u00F1o",
    }

    url = "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    page = self.lxmlize(url)

    member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
    for member_node in member_nodes:
        member_info = member_node.text_content().strip().split("\n")

        name = re.sub(r"^Hon\.", "", member_info[0]).strip()
        district_text = member_info[-1].strip()
        if district_text == "Representante por Acumulación":
            district = "At-Large"
        else:
            district = district_text.replace(
                "Representante del Distrito ", "").strip()
        photo_url = self.get_node(member_node, ".//img/@src")

        rep_link = self.get_node(member_node, ".//a/@href")
        rep_page = self.lxmlize(rep_link)

        party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
        # NOTE: compare against None rather than using truthiness; an lxml
        # element with no child elements is falsy even when it exists.
        # (Assumes get_node returns None when nothing matches; confirm.)
        if party_node is None:
            party_text = ""
        else:
            party_text = party_node.text_content().strip()
        # Albelo doesn't seem to have a "partyBio" as an independent, but we
        # expect this to exist for all other members.
        if not party_text and name == "Manuel A. Natal Albelo":
            party = "Independent"
        else:
            party = party_map[party_text]

        address = self.get_node(
            rep_page, "//h6").text.strip().split("\n")[0].strip()

        # Only grabs the first validated phone number found.
        # Typically, representatives have multiple phone numbers.
        phone_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Tel.")]')
        phone = None
        if phone_node is not None and phone_node.text:
            for phone_attempt in phone_node.text.strip().split("\n"):
                phone_text = re.sub(
                    r"^Tel\.[\s]*", "", phone_attempt).strip()
                if validate_phone_number(phone_text):
                    # Don't keep searching once a good number is found.
                    phone = phone_text
                    break

        fax_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Fax.")]')
        fax = None
        # `if fax_node:` would skip a text-only <span> (no children).
        if fax_node is not None and fax_node.text:
            fax_text = re.sub(
                r"^Fax\.[\s]*", "", fax_node.text.strip()).strip()
            if validate_phone_number(fax_text):
                fax = fax_text

        person = Person(
            primary_org="lower",
            district=district,
            name=name,
            party=party,
            image=photo_url,
        )

        person.add_link(rep_link)
        person.add_source(rep_link)
        person.add_source(url)

        if address:
            person.add_contact_detail(
                type="address", value=address, note="Capitol Office")
        if phone:
            person.add_contact_detail(
                type="voice", value=phone, note="Capitol Office")
        if fax:
            person.add_contact_detail(
                type="fax", value=fax, note="Capitol Office")

        yield person
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Args:
        term: passed through unchanged as the ``term`` of every legislator.

    The phone number and e-mail address are passed on only when they pass
    ``validate_phone_number`` / ``validate_email_address``; otherwise the
    corresponding keyword is simply omitted.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    for legislator_node in legislator_nodes:
        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')
        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

        # The first link's bold text carries name, district and party,
        # e.g. everything up to "(" is the name.
        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0).replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district = (
            district_match.group(0).replace('(', '').lstrip('0').strip())

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0).replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]')
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Bind explicitly to None on failed validation; leaving the name
        # unbound would raise UnboundLocalError further down.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = info_nodes[1].text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        legislator_kwargs = {
            'term': term,
            'chamber': 'lower',
            'district': district,
            'full_name': name,
            'party': party,
            'photo_url': photo_url,
        }
        office_kwargs = {
            'type': 'capitol',
            'name': "Capitol Office",
            'address': address,
        }
        # Only forward contact values that actually validated.
        if phone is not None:
            office_kwargs['phone'] = phone
        if email is not None:
            legislator_kwargs['email'] = email
            office_kwargs['email'] = email

        legislator = Legislator(**legislator_kwargs)
        legislator.add_source(url)
        legislator.add_office(**office_kwargs)

        self.save_legislator(legislator)
def legislators(self, latest_only):
    """Collect people and their memberships from ``self._memberships``.

    Args:
        latest_only: forwarded unchanged to ``self._memberships``.

    Returns:
        dict mapping a legislator's name to a ``(Person, terms)`` pair,
        where ``terms`` is a list of ``(chamber, district, term, party)``
        tuples, one per membership row seen for that name.
    """
    # Loop-invariant lookup tables, built once.
    party_names = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
    offices = {'capitol': '//table[contains(string(), "Springfield Office")]',
               'district': '//table[contains(string(), "District Office")]'}
    hotgarbage = (
        'Senate Biography Information for the 98th General '
        'Assembly is not currently available.')

    legs = {}
    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath('td')
        district = district.text
        detail_url = name.xpath('a/@href')[0]

        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue

        party = party_names[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue

        name = AKA.get(name, name)

        if name in legs:
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            continue

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # Start from a clean list so re-visiting a name does not duplicate
        # contact details.
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            # .tail is None when no text follows the <b>; treat as empty.
            p.add_contact_detail(type='email',
                                 value=(email[0].tail or '').strip(),
                                 note='capitol')

        for location, xpath in offices.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): index 3 assumes at least four tables match
                # the query; confirm against the page markup.
                for detail_type, value in self._table_to_office(table[3]):
                    if (detail_type in ('fax', 'voice')
                            and not validate_phone_number(value)):
                        continue

                    p.add_contact_detail(
                        type=detail_type, value=value, note=location)

    return legs
def legislators(self, latest_only):
    """Collect people and their memberships from ``self._memberships``.

    Args:
        latest_only: forwarded unchanged to ``self._memberships``.

    Returns:
        dict mapping a legislator's name to a ``(Person, terms)`` pair,
        where ``terms`` is a list of ``(chamber, district, term, party)``
        tuples, one per membership row seen for that name.
    """
    # Loop-invariant lookup tables, built once.
    party_names = {
        "D": "Democratic",
        "R": "Republican",
        "I": "Independent",
    }
    offices = {
        "capitol": '//table[contains(string(), "Springfield Office")]',
        "district": '//table[contains(string(), "District Office")]',
    }
    hotgarbage = ("Senate Biography Information for the 98th General "
                  "Assembly is not currently available.")

    legs = {}
    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath("td")
        district = district.text
        detail_url = name.xpath("a/@href")[0]

        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue

        party = party_names[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith("*"):
            continue

        name = AKA.get(name, name)

        if name in legs:
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning("No legislator bio available for " + name)
            continue

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # Start from a clean list so re-visiting a name does not duplicate
        # contact details.
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            # .tail is None when no text follows the <b>; treat as empty.
            p.add_contact_detail(type="email",
                                 value=(email[0].tail or "").strip(),
                                 note="capitol")

        for location, xpath in offices.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): index 3 assumes at least four tables match
                # the query; confirm against the page markup.
                for detail_type, value in self._table_to_office(table[3]):
                    if (detail_type in ("fax", "voice")
                            and not validate_phone_number(value)):
                        continue

                    p.add_contact_detail(
                        type=detail_type, value=value, note=location)

    return legs
def scrape_lower_chamber(self, term):
    """Yield a ``Person`` for every representative on the chamber roster.

    Each member's own page is fetched to read party, address, phone and fax.
    ``term`` is accepted but not used by this method.

    Address, voice and fax contact details are added only when a value was
    found (and, for voice/fax, passed ``validate_phone_number``).

    Raises:
        KeyError: if a member's party text is not in the local party map.
    """
    # E-mail contact is now hidden behind webforms. Sadness.
    party_map = {
        'PNP': 'Partido Nuevo Progresista',
        'PPD': u'Partido Popular Democr\xe1tico',
        'PIP': u'Partido Independentista Puertorrique\u00F1o',
    }

    url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx'
    page = self.lxmlize(url)

    member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
    for member_node in member_nodes:
        member_info = member_node.text_content().strip().split("\n")

        name = re.sub(r'^Hon\.', '', member_info[0]).strip()
        district_text = member_info[-1].strip()
        if district_text == 'Representante por Acumulación':
            district = 'At-Large'
        else:
            district = district_text.replace(
                "Representante del Distrito ", "").strip()
        photo_url = self.get_node(member_node, './/img/@src')

        rep_link = self.get_node(member_node, ".//a/@href")
        rep_page = self.lxmlize(rep_link)

        party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
        party_text = party_node.text_content().strip()
        party = party_map[party_text]

        address = self.get_node(
            rep_page, '//h6').text.strip().split("\n")[0].strip()

        # Only grabs the first validated phone number found.
        # Typically, representatives have multiple phone numbers.
        phone_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Tel.")]')
        phone = None
        # (Assumes get_node returns None when nothing matches; confirm.)
        if phone_node is not None and phone_node.text:
            for phone_attempt in phone_node.text.strip().split("\n"):
                phone_text = re.sub(
                    r'^Tel\.[\s]*', '', phone_attempt).strip()
                if validate_phone_number(phone_text):
                    # Don't keep searching once a good number is found.
                    phone = phone_text
                    break

        fax_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Fax.")]')
        fax = None
        # Compare against None: an lxml element with no child elements is
        # falsy, so `if fax_node:` would skip a text-only <span>.
        if fax_node is not None and fax_node.text:
            fax_text = re.sub(
                r'^Fax\.[\s]*', '', fax_node.text.strip()).strip()
            if validate_phone_number(fax_text):
                fax = fax_text

        person = Person(
            primary_org='lower',
            district=district,
            name=name,
            party=party,
            image=photo_url,
        )

        person.add_link(rep_link)
        person.add_source(rep_link)
        person.add_source(url)

        if address:
            person.add_contact_detail(
                type='address', value=address, note='Capitol Office')
        if phone:
            person.add_contact_detail(
                type='voice', value=phone, note='Capitol Office')
        if fax:
            person.add_contact_detail(
                type='fax', value=fax, note='Capitol Office')

        yield person
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Args:
        term: passed through unchanged as the ``term`` of every legislator.

    The phone number and e-mail address are passed on only when they pass
    ``validate_phone_number`` / ``validate_email_address``; otherwise the
    corresponding keyword is simply omitted. After all rows are processed,
    a warning is logged if no row needed the e-mail fallback.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    need_special_email_case = False

    for legislator_node in legislator_nodes:
        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')
        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

        # The first link's bold text carries name, district and party,
        # e.g. everything up to "(" is the name.
        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0).replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district = (
            district_match.group(0).replace('(', '').lstrip('0').strip())

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0).replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]')
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Bind explicitly to None on failed validation; leaving the name
        # unbound would raise UnboundLocalError further down.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # E-mail markup is screwed-up and inconsistent.
        try:
            email_text = info_nodes[1].text
        except IndexError:
            # Primarily for Dan Fabian: no e-mail link, so fall back to the
            # fourth text line. This must set email_text itself, otherwise
            # the value is unbound or left over from the previous row.
            email_text = info_texts[3]
            need_special_email_case = True

        email_text = email_text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        legislator_kwargs = {
            'term': term,
            'chamber': 'lower',
            'district': district,
            'full_name': name,
            'party': party,
            'photo_url': photo_url,
        }
        office_kwargs = {
            'type': 'capitol',
            'name': "Capitol Office",
            'address': address,
        }
        # Only forward contact values that actually validated.
        if phone is not None:
            office_kwargs['phone'] = phone
        if email is not None:
            legislator_kwargs['email'] = email
            office_kwargs['email'] = email

        legislator = Legislator(**legislator_kwargs)
        legislator.add_source(url)
        legislator.add_office(**office_kwargs)

        self.save_legislator(legislator)

    # Checked once after the loop: the flag records whether any row used
    # the fallback.
    if not need_special_email_case:
        self.logger.warning('Special e-mail handling no longer required.')
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Args:
        term: passed through unchanged as the ``term`` of every legislator.

    The phone number and e-mail address are passed on only when they pass
    ``validate_phone_number`` / ``validate_email_address``; otherwise the
    corresponding keyword is simply omitted. After all rows are processed,
    a warning is logged if no row needed the e-mail fallback.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    need_special_email_case = False

    for legislator_node in legislator_nodes:
        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')
        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

        # The first link's bold text carries name, district and party,
        # e.g. everything up to "(" is the name.
        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0).replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district = (
            district_match.group(0).replace('(', '').lstrip('0').strip())

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0).replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]')
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Bind explicitly to None on failed validation; leaving the name
        # unbound would raise UnboundLocalError further down.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # E-mail markup is screwed-up and inconsistent.
        try:
            email_text = info_nodes[1].text
        except IndexError:
            # Primarily for Dan Fabian: no e-mail link, so fall back to the
            # fourth text line. This must set email_text itself, otherwise
            # the value is unbound or left over from the previous row.
            email_text = info_texts[3]
            need_special_email_case = True

        email_text = email_text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        legislator_kwargs = {
            'term': term,
            'chamber': 'lower',
            'district': district,
            'full_name': name,
            'party': party,
            'photo_url': photo_url,
        }
        office_kwargs = {
            'type': 'capitol',
            'name': "Capitol Office",
            'address': address,
        }
        # Only forward contact values that actually validated.
        if phone is not None:
            office_kwargs['phone'] = phone
        if email is not None:
            legislator_kwargs['email'] = email
            office_kwargs['email'] = email

        legislator = Legislator(**legislator_kwargs)
        legislator.add_source(url)
        legislator.add_office(**office_kwargs)

        self.save_legislator(legislator)

    # Checked once after the loop: the flag records whether any row used
    # the fallback.
    if not need_special_email_case:
        self.logger.warning('Special e-mail handling no longer required.')