def handle_list_item(self, item):
    """Build a ``Person`` for one representative list entry.

    Reads the photo, profile link, name/district/party header, address,
    phone and e-mail out of ``item`` (an element supporting ``xpath``) and
    yields a single ``Person`` with ``primary_org="lower"``.

    The phone number and e-mail address are attached only when they pass
    ``validate_phone_number`` / ``validate_email_address``; an invalid value
    is skipped instead of aborting the whole entry.

    Raises:
        IndexError: if an expected node or text line is missing.
        AttributeError: if the header text does not match the expected
            ``Name (NNX, PARTY)`` pattern (``re.match`` returns ``None``).
        KeyError: if the party abbreviation is not a key of ``PARTIES``.
    """
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    # Header groups: name, two-digit district plus A/B suffix, party code.
    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    # The first two non-empty text lines form the address; the third is the
    # phone number.
    address = "\n".join((info_texts[0], info_texts[1]))

    # Default to None so a failed validation cannot leave the name unbound.
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone is not None:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email is not None:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
def scrape_chamber(self, chamber=None):
    """Scrape one chamber's legislators and yield a ``Person`` for each.

    Contact details (e-mail, phone, detail-page link) are read from the
    chamber's e-mail list page and keyed by district; the member roster is
    read from the chamber's ``.xls`` spreadsheet, and the two are joined on
    the district number. Vacant seats are logged and skipped.

    Relies on ``link_col_ix``, ``excel_mapping`` and ``translate``, which are
    not defined in this block and are expected to exist at module scope.

    Args:
        chamber: ``"upper"`` or ``"lower"``.

    Raises:
        ValueError: if ``chamber`` is neither ``"upper"`` nor ``"lower"``.
        KeyError: if a spreadsheet district has no row on the contact page,
            or a party value is not a key of ``translate``.
    """
    if chamber == "upper":
        url = "http://webserver.rilin.state.ri.us/Documents/Senators.xls"
        rep_type = "Senator"
        contact_url = (
            "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp"
        )
    elif chamber == "lower":
        url = "http://webserver.rilin.state.ri.us/Documents/Representatives.xls"
        rep_type = "Representative"
        contact_url = (
            "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp"
        )
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``contact_url``.
        raise ValueError(
            "chamber must be 'upper' or 'lower', got {!r}".format(chamber)
        )

    contact_page = self.lxmlize(contact_url)
    contact_info_by_district = {}
    for row in contact_page.xpath('//tr[@valign="TOP"]'):
        tds = row.xpath("td")
        (row_detail_link,) = tds[link_col_ix].xpath(".//a/@href")
        # Ignore name (2nd col); the spreadsheet name is cleaned up below.
        row_district, _, row_email, row_phone = [
            td.text_content().strip() for td in tds[:link_col_ix]
        ]
        contact_info_by_district[row_district] = {
            "email": row_email,
            "phone": row_phone,
            "detail_link": row_detail_link,
        }

    self.urlretrieve(url, "ri_leg.xls")
    wb = xlrd.open_workbook("ri_leg.xls")
    sh = wb.sheet_by_index(0)

    # Row 0 is skipped (iteration starts at 1).
    for rownum in range(1, sh.nrows):
        d = {
            field: sh.cell(rownum, col_num).value
            for field, col_num in excel_mapping.items()
        }

        # Convert float to an int, and then to string, the required format
        district = str(int(d["district"]))
        if d["full_name"].upper() == "VACANT":
            self.warning("District {}'s seat is vacant".format(district))
            continue

        contact_info = contact_info_by_district[district]
        detail_link = contact_info["detail_link"]

        # Strip a leading "Senator"/"Representative" title from the name.
        full_name = re.sub(
            r"^{}(?=\s?[A-Z].*$)".format(rep_type), "", d["full_name"]
        ).strip()

        # Note - if we ever need to speed this up, it looks like photo_url can be mapped
        # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg
        detail_page = self.lxmlize(detail_link)
        (photo_url,) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src')

        person = Person(
            primary_org=chamber,
            district=district,
            name=full_name,
            party=translate[d["party"]],
            image=photo_url,
        )
        person.extras["town_represented"] = d["town_represented"]

        # Use this legislator's own link. The loop variable left over from
        # the contact-page pass above would be the last row's link for
        # every person.
        person.add_link(detail_link)

        if d["address"] and d["address"] != "-":
            person.add_contact_detail(
                type="address", value=d["address"], note="District Office"
            )

        phone = contact_info["phone"]
        if phone and validate_phone_number(phone):
            person.add_contact_detail(
                type="voice", value=phone, note="District Office"
            )

        email = contact_info["email"]
        if email and validate_email_address(email):
            person.add_contact_detail(
                type="email", value=email, note="District Office"
            )

        person.add_source(contact_url)
        person.add_source(detail_link)

        yield person
def legislators(self, latest_only):
    """Collect legislators and their terms from the membership listings.

    Iterates ``self._memberships(latest_only)``, creating one ``Person`` per
    distinct (alias-normalised) name and accumulating a
    ``(chamber, district, term, party)`` tuple for every membership row.
    Each row's detail page is fetched to fill in the photo, e-mail and
    office contact details.

    Returns:
        dict mapping name to a ``(Person, [term tuples])`` pair.
    """
    party_names = {"D": "Democratic", "R": "Republican", "I": "Independent"}
    office_xpaths = {
        "Capitol Office": '//table[contains(string(), "Springfield Office")]',
        "District Office": '//table[contains(string(), "District Office")]',
    }
    bio_unavailable = (
        "Senate Biography Information for the 98th General "
        "Assembly is not currently available."
    )

    legs = {}

    for member, chamber, term, url in self._memberships(latest_only):
        name_cell, _, _, district_cell, party_cell = member.xpath("td")
        district = district_cell.text
        detail_url = name_cell.xpath("a/@href")[0]

        # A blank party cell is treated as Independent.
        if party_cell.text_content().strip() == "":
            party = "Independent"
        else:
            party = party_names[party_cell.text]

        name = name_cell.text_content().strip()

        # inactive legislator (trailing asterisk), skip them for now
        if name.endswith("*"):
            continue

        name = AKA.get(name, name)
        membership = (chamber, district, term, party)

        if name in legs:
            person = legs[name][0]
            legs[name][1].append(membership)
        else:
            person = Person(name, party=party)
            legs[name] = person, [membership]

        person.add_source(url)
        person.add_source(detail_url)
        person.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name)
        if birth_date:
            person.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        if bio_unavailable in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning("No legislator bio available for " + name)
            continue

        person.image = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]

        # Contact details are rebuilt from scratch on every visit.
        person.contact_details = []

        email_labels = leg_doc.xpath('//b[text()="Email: "]')
        if email_labels:
            person.add_contact_detail(
                type="email",
                value=email_labels[0].tail.strip(),
                note="Capitol Office",
            )

        for location, xpath in office_xpaths.items():
            tables = leg_doc.xpath(xpath)
            if not tables:
                continue
            # The fourth matching table is the one handed to the office parser.
            for detail_type, value in self._table_to_office(tables[3]):
                is_phone = detail_type in ("fax", "voice")
                if is_phone and not validate_phone_number(value):
                    continue
                person.add_contact_detail(
                    type=detail_type, value=value, note=location
                )

    return legs
def clean_alternative_phone_number(self):
    """Validate the optional alternative phone number.

    Returns the submitted value after passing it to
    ``validate_phone_number``, or ``None`` when the field was left empty.
    The validator's return value is ignored; presumably it raises on
    invalid input (confirm against its definition).
    """
    alt_phone_no = self.cleaned_data['alternative_phone_number']
    if not alt_phone_no:
        # Nothing submitted: skip validation entirely.
        return None
    validate_phone_number(alt_phone_no)
    return alt_phone_no
def clean_primary_phone_number(self):
    """Validate and return the submitted primary phone number.

    The value is always passed to ``validate_phone_number``, whose return
    value is ignored; presumably it raises on invalid input (confirm
    against its definition).
    """
    submitted = self.cleaned_data['primary_phone_number']
    validate_phone_number(submitted)
    return submitted
def clean_contact_person_phone_number(self):
    """Validate and return the contact person's phone number.

    The value is always passed to ``validate_phone_number``, whose return
    value is ignored; presumably it raises on invalid input (confirm
    against its definition).
    """
    phone_no = self.cleaned_data['contact_person_phone_number']
    validate_phone_number(phone_no)
    return phone_no
def scrape_lower_chamber(self, term):
    """Scrape House members from the chamber biography listing.

    For every ``li.selectionRep`` entry on the listing page this reads the
    name, district and photo, then loads the member's own page for party,
    address, phone and fax, and yields one ``Person`` with
    ``primary_org="lower"``.

    Args:
        term: accepted for interface compatibility; not used by this method.

    Raises:
        AttributeError: if a member other than the known independent has no
            ``partyBio`` node, or the page has no ``h6`` address node.
        KeyError: if the party text is not a key of the party map.
    """
    # E-mail contact is now hidden behind webforms. Sadness.
    party_map = {
        "PNP": "Partido Nuevo Progresista",
        "PPD": u"Partido Popular Democr\xe1tico",
        "PIP": u"Partido Independentista Puertorrique\u00F1o",
    }
    # The first entry is the correctly spelled label ("Acumulacion" with an
    # acute o). The second is the mis-encoded spelling this check used
    # before, kept so any input that matched previously still matches.
    at_large_labels = (
        "Representante por Acumulaci\u00f3n",
        "Representante por Acumulaci\u0106\u00b3n",
    )

    url = "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    page = self.lxmlize(url)

    member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
    for member_node in member_nodes:
        member_info = member_node.text_content().strip().split("\n")

        name = re.sub(r"^Hon\.", "", member_info[0]).strip()
        district_text = member_info[-1].strip()
        if district_text in at_large_labels:
            district = "At-Large"
        else:
            district = district_text.replace(
                "Representante del Distrito ", ""
            ).strip()

        photo_url = self.get_node(member_node, ".//img/@src")

        rep_link = self.get_node(member_node, ".//a/@href")
        rep_page = self.lxmlize(rep_link)

        party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
        # Albelo doesn't seem to have a "partyBio" as an independent, but we
        # expect this to exist for all other members.
        # Compare with None explicitly: an lxml element with no children is
        # falsy, so a plain truth test misreports a present node as missing.
        if party_node is None and name == "Manuel A. Natal Albelo":
            party = "Independent"
        else:
            party_text = party_node.text_content().strip()
            party = party_map[party_text]

        address = (
            self.get_node(rep_page, "//h6").text.strip().split("\n")[0].strip()
        )

        # Only grabs the first validated phone number found.
        # Typically, representatives have multiple phone numbers.
        phone_node = self.get_node(
            rep_page, '//span[@class="data-type" and contains(text(), "Tel.")]'
        )
        phone = None
        if phone_node is not None and phone_node.text:
            for phone_attempt in phone_node.text.strip().split("\n"):
                phone_text = re.sub(r"^Tel\.[\s]*", "", phone_attempt).strip()
                if validate_phone_number(phone_text):
                    # Don't keep searching once a good number is found.
                    phone = phone_text
                    break

        fax_node = self.get_node(
            rep_page, '//span[@class="data-type" and contains(text(), "Fax.")]'
        )
        fax = None
        # Explicit None check for the same reason as party_node above; a
        # truth test dropped the fax whenever the span had no children.
        if fax_node is not None and fax_node.text:
            fax_text = re.sub(r"^Fax\.[\s]*", "", fax_node.text.strip()).strip()
            if validate_phone_number(fax_text):
                fax = fax_text

        person = Person(
            primary_org="lower",
            district=district,
            name=name,
            party=party,
            image=photo_url,
        )

        person.add_link(rep_link)
        person.add_source(rep_link)
        person.add_source(url)

        if address:
            person.add_contact_detail(
                type="address", value=address, note="Capitol Office"
            )
        if phone:
            person.add_contact_detail(
                type="voice", value=phone, note="Capitol Office"
            )
        if fax:
            person.add_contact_detail(type="fax", value=fax, note="Capitol Office")

        yield person
def _parse_office(self, office_node):
    """
    Extract contact information for a single office.

    Returns a dict with the keys ``name``, ``type``, ``phone``, ``fax`` and
    ``address``, or ``None`` when the office is neither the Albany (capitol)
    office nor a district office. ``address`` is set only when street, city,
    state and postal code are all present; ``fax`` is kept only when it
    passes ``validate_phone_number``.
    """

    def stripped(xpath):
        # Stripped value of the node found at ``xpath``, or None if absent.
        found = self.get_node(office_node, xpath)
        return None if found is None else found.strip()

    office_label = stripped('.//span[@itemprop="name"]/text()')
    if office_label is None:
        # No label: the substring checks below will both fail.
        office_label = ""

    # Determine office names/types consistent with Open States internal
    # format.
    if "Albany Office" in office_label:
        office_name, office_type = "Capitol Office", "capitol"
    elif "District Office" in office_label:
        office_name, office_type = "District Office", "district"
    else:
        # Terminate if not a capitol or district office.
        return None

    street_address = stripped(
        './/div[@class="street-address"][1]/'
        'span[@itemprop="streetAddress"][1]/text()'
    )

    city = stripped('.//span[@class="locality"][1]/text()')
    if city is not None:
        # Drop the trailing comma that separates city from state.
        city = city.rstrip(",")

    state = stripped('.//span[@class="region"][1]/text()')
    zip_code = stripped('.//span[@class="postal-code"][1]/text()')

    # Build the physical address only when every component was found.
    address = None
    if None not in (street_address, city, state, zip_code):
        address = "{}\n{}, {} {}".format(street_address, city, state, zip_code)

    phone = None
    phone_node = self.get_node(
        office_node, './/div[@class="tel"]/span[@itemprop="telephone"]'
    )
    if phone_node is not None:
        phone = phone_node.text.strip()

    fax = None
    fax_node = self.get_node(
        office_node, './/div[@class="tel"]/span[@itemprop="faxNumber"]'
    )
    if fax_node is not None:
        candidate = fax_node.text.strip()
        if validate_phone_number(candidate):
            fax = candidate

    return {
        "name": office_name,
        "type": office_type,
        "phone": phone,
        "fax": fax,
        "address": address,
    }
def scrape_offices(self, legislator, doc):
    """Attach capitol and district contact details to ``legislator``.

    The text around the "Capitol Office:" label in ``doc`` is split into
    chunks; a chunk ending in a colon starts a new section and the chunks
    that follow are collected under it. From those sections this adds:

    * Capitol Office: e-mail (second ``mailto:`` link in the document),
      "Capitol Phone:" and the "Capitol Office:" address lines.
    * District Office: the "Home Phone:" number, falling back to
      "Business Phone:" when the home number is missing or a placeholder,
      and the "Home:" address lines.

    Phone numbers are added only if they pass ``validate_phone_number``.

    Raises:
        IndexError: if the "Capitol Office:" label or a second ``mailto:``
            link is not present in ``doc``.
    """
    # Retrieve element that should contain all contact information for the
    # legislator and turn its text into a list of non-empty chunks.
    label = doc.xpath('//b[contains(., "Capitol Office:")]')[0]
    stripped_chunks = (t.strip() for t in label.getparent().itertext())
    chunks = [t for t in stripped_chunks if t]

    # Parse contact sections keyed by their header text.
    officedata = defaultdict(list)
    current = None
    current_key = None
    for chunk in chunks:
        # Skip parsing biography link.
        if chunk.lower() == "biography":
            break
        # Contact snippets should be elements with headers that end in
        # colons.
        if chunk.endswith(":"):
            current_key = chunk
            current = officedata[current_key]
        elif current is not None:
            current.append(chunk)
            # Stop once the business phone value has been captured.
            if current_key == "Business Phone:":
                break

    def first_usable(key):
        # First entry under ``key``, or None when the section is missing,
        # empty, or holds only a placeholder.
        values = officedata.get(key)
        if values and values[0] not in ("", "NA"):
            return values[0]
        return None

    email = doc.xpath('//a[contains(@href, "mailto:")]/@href')[1]
    # Drop the leading "mailto:" (7 characters).
    email = email[7:]

    capitol_phone = first_usable("Capitol Phone:")
    capitol_lines = officedata.get("Capitol Office:")
    capitol_address = "\n".join(capitol_lines) if capitol_lines else None

    if email:
        legislator.add_contact_detail(
            type="email", value=email, note="Capitol Office"
        )
    if capitol_phone and validate_phone_number(capitol_phone):
        legislator.add_contact_detail(
            type="voice", value=capitol_phone, note="Capitol Office"
        )
    if capitol_address:
        legislator.add_contact_detail(
            type="address", value=capitol_address, note="Capitol Office"
        )

    # If a business or home phone is listed, attempt to use the home phone
    # first, then fall back on the business phone for the district office
    # number. Each section is checked on its own so a missing home phone
    # no longer prevents the business phone from being used.
    district_phone = first_usable("Home Phone:")
    if district_phone is None:
        district_phone = first_usable("Business Phone:")

    home_lines = officedata.get("Home:")
    district_address = "\n".join(home_lines) if home_lines else None

    # Add district office entries only if data exists for them.
    if district_phone and validate_phone_number(district_phone):
        legislator.add_contact_detail(
            type="voice", value=district_phone, note="District Office"
        )
    if district_address:
        legislator.add_contact_detail(
            type="address", value=district_address, note="District Office"
        )