Example #1
0
    def handle_list_item(self, item):
        """Build and yield a ``Person`` for one representative list entry.

        ``item`` is the element for a single member. Its heading carries text
        shaped like ``"<name> (<2 digits><A|B>, <PARTY>)"``, from which the
        name, district and party abbreviation are taken. The abbreviation is
        translated through ``PARTIES`` (defined elsewhere in this module).

        Phone and e-mail are attached only when they pass validation; an
        invalid value is skipped rather than aborting the whole entry.

        Raises:
            ValueError: if the heading text does not match the expected shape.
        """
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        if name_match is None:
            # Fail with a message that names the offending text instead of an
            # opaque AttributeError on None.
            raise ValueError(
                "Unrecognized legislator heading: {!r}".format(name_text)
            )
        name = name_match.group(1).strip()
        # Districts are zero-padded on the page ("01A"); drop the padding.
        district = name_match.group(2).lstrip("0").upper()
        party = PARTIES[name_match.group(3)]

        info_texts = [
            x.strip() for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        # First two non-empty text lines form the address, the third is the
        # phone number.
        address = "\n".join((info_texts[0], info_texts[1]))

        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # The second link of the entry is used as the e-mail (mailto:) link.
        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        if phone is not None:
            rep.add_contact_detail(type="voice", value=phone, note="capitol")
        if email is not None:
            rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
Example #2
0
    def scrape_chamber(self, chamber=None):
        """Yield a ``Person`` for every seated member of one chamber.

        Two sources are combined: an HTML contact listing (e-mail, phone and
        detail-page link, keyed by district) and an Excel roster downloaded to
        ``ri_leg.xls`` (name, party, address, town). Vacant seats are logged
        and skipped.

        Args:
            chamber: ``"upper"`` for the Senate or ``"lower"`` for the House.

        Raises:
            ValueError: if ``chamber`` is neither ``"upper"`` nor ``"lower"``
                (raised when iteration starts, since this is a generator).
        """
        if chamber == "upper":
            url = "http://webserver.rilin.state.ri.us/Documents/Senators.xls"
            rep_type = "Senator"
            contact_url = (
                "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp"
            )
        elif chamber == "lower":
            url = "http://webserver.rilin.state.ri.us/Documents/Representatives.xls"
            rep_type = "Representative"
            contact_url = (
                "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp"
            )
        else:
            raise ValueError(
                "chamber must be 'upper' or 'lower', got {!r}".format(chamber)
            )

        # Pass 1: index the contact listing by district.
        # link_col_ix is defined outside this method; it is used both as the
        # index of the link column and as the count of text columns before it.
        contact_page = self.lxmlize(contact_url)
        contact_info_by_district = {}
        for row in contact_page.xpath('//tr[@valign="TOP"]'):
            tds = row.xpath("td")
            (row_link, ) = tds[link_col_ix].xpath(".//a/@href")
            # Ignore name (2nd col): the spreadsheet name is used instead.
            row_district, _, row_email, row_phone = [
                td.text_content().strip() for td in tds[:link_col_ix]
            ]
            contact_info_by_district[row_district] = {
                "email": row_email,
                "phone": row_phone,
                "detail_link": row_link,
            }

        # Pass 2: walk the spreadsheet roster (row 0 is skipped as a header).
        self.urlretrieve(url, "ri_leg.xls")

        wb = xlrd.open_workbook("ri_leg.xls")
        sh = wb.sheet_by_index(0)

        for rownum in range(1, sh.nrows):
            d = {
                field: sh.cell(rownum, col_num).value
                for field, col_num in excel_mapping.items()
            }

            # Convert float to an int, and then to string, the required format
            district = str(int(d["district"]))
            if d["full_name"].upper() == "VACANT":
                self.warning("District {}'s seat is vacant".format(district))
                continue

            contact_info = contact_info_by_district[district]
            detail_link = contact_info["detail_link"]

            # Strip a leading "Senator"/"Representative" title when it is
            # followed (after optional whitespace) by a capitalised name.
            full_name = re.sub(r"^{}(?=\s?[A-Z].*$)".format(rep_type), "",
                               d["full_name"]).strip()

            # Note - if we ever need to speed this up, it looks like photo_url can be mapped
            # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg
            detail_page = self.lxmlize(detail_link)
            (photo_url,
             ) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src')

            person = Person(
                primary_org=chamber,
                district=district,
                name=full_name,
                party=translate[d["party"]],
                image=photo_url,
            )
            person.extras["town_represented"] = d["town_represented"]
            # Link this member's own detail page (not whichever link the
            # contact-listing loop happened to see last).
            person.add_link(detail_link)

            if d["address"] and d["address"] != "-":
                person.add_contact_detail(type="address",
                                          value=d["address"],
                                          note="District Office")

            phone = contact_info["phone"]
            if phone and validate_phone_number(phone):
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="District Office")
            email = contact_info["email"]
            if email and validate_email_address(email):
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="District Office")

            person.add_source(contact_url)
            person.add_source(detail_link)

            yield person
Example #3
0
    def legislators(self, latest_only):
        """Collect legislators together with every membership they have held.

        Iterates the rows produced by ``self._memberships(latest_only)``,
        fetches each legislator's detail page, and records photo, e-mail and
        office contact details on a ``Person``.

        Returns:
            dict mapping legislator name to a ``(Person, memberships)`` pair,
            where ``memberships`` is a list of
            ``(chamber, district, term, party)`` tuples.
        """
        party_names = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        bio_missing_notice = (
            "Senate Biography Information for the 98th General "
            "Assembly is not currently available."
        )
        office_xpaths = {
            "Capitol Office": '//table[contains(string(), "Springfield Office")]',
            "District Office": '//table[contains(string(), "District Office")]',
        }

        people = {}

        for row, chamber, term, url in self._memberships(latest_only):
            name_cell, _, _, district_cell, party_cell = row.xpath("td")
            district = district_cell.text
            detail_url = name_cell.xpath("a/@href")[0]

            # A blank party cell is treated as Independent.
            if party_cell.text_content().strip():
                party = party_names[party_cell.text]
            else:
                party = "Independent"
            name = name_cell.text_content().strip()

            # A trailing asterisk marks an inactive legislator; skip for now.
            if name.endswith("*"):
                continue

            name = AKA.get(name, name)

            membership = (chamber, district, term, party)
            if name in people:
                person, memberships = people[name]
                memberships.append(membership)
            else:
                person = Person(name, party=party)
                people[name] = person, [membership]

            person.add_source(url)
            person.add_source(detail_url)
            person.add_link(detail_url)

            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                person.birth_date = birth_date

            detail_html = self.get(detail_url).text
            detail_doc = lxml.html.fromstring(detail_html)
            detail_doc.make_links_absolute(detail_url)

            if bio_missing_notice in detail_html:
                # The legislator's bio isn't available yet.
                self.logger.warning("No legislator bio available for " + name)
                continue

            person.image = detail_doc.xpath(
                '//img[contains(@src, "/members/")]/@src'
            )[0]

            # Start from a clean slate so a repeat visit does not duplicate
            # contact details.
            person.contact_details = []

            email_labels = detail_doc.xpath('//b[text()="Email: "]')
            if email_labels:
                person.add_contact_detail(
                    type="email",
                    value=email_labels[0].tail.strip(),
                    note="Capitol Office",
                )

            for location, xpath in office_xpaths.items():
                tables = detail_doc.xpath(xpath)
                if not tables:
                    continue
                # NOTE(review): the fourth match is used - presumably because
                # enclosing layout tables also contain the label text; confirm
                # against the live page.
                for detail_type, value in self._table_to_office(tables[3]):
                    is_phone = detail_type in ("fax", "voice")
                    if is_phone and not validate_phone_number(value):
                        continue

                    person.add_contact_detail(
                        type=detail_type, value=value, note=location
                    )

        return people
Example #4
0
    def clean_alternative_phone_number(self):
        """Validate the optional alternative phone number.

        Returns the submitted number after passing it to
        ``validate_phone_number`` (presumably raising on bad input - its
        return value is ignored here). An empty/falsy value skips validation
        and yields ``None``.
        """
        number = self.cleaned_data['alternative_phone_number']
        if not number:
            return None

        validate_phone_number(number)
        return number
Example #5
0
    def clean_primary_phone_number(self):
        """Validate and return the primary phone number.

        The value is passed to ``validate_phone_number`` (presumably raising
        on bad input - its return value is ignored) and then returned
        unchanged.
        """
        value = self.cleaned_data['primary_phone_number']
        validate_phone_number(value)
        return value
Example #6
0
    def clean_contact_person_phone_number(self):
        """Validate and return the contact person's phone number.

        The value is passed to ``validate_phone_number`` (presumably raising
        on bad input - its return value is ignored) and then returned
        unchanged.
        """
        contact_phone = self.cleaned_data['contact_person_phone_number']
        validate_phone_number(contact_phone)
        return contact_phone
Example #7
0
    def scrape_lower_chamber(self, term):
        """Yield a ``Person`` for each representative on the House roster.

        For every roster entry the member's own page is fetched to obtain
        party, address, phone and fax. Only the first phone number that
        validates is kept. Missing address/phone/fax nodes are tolerated and
        the corresponding contact detail is simply omitted.

        Args:
            term: accepted for interface compatibility; not used here.
        """
        # E-mail contact is now hidden behind webforms. Sadness.

        party_map = {
            "PNP": "Partido Nuevo Progresista",
            "PPD": u"Partido Popular Democr\xe1tico",
            "PIP": u"Partido Independentista Puertorrique\u00F1o",
        }
        # Escaped so the comparison cannot be broken by file-encoding damage.
        at_large_label = u"Representante por Acumulaci\u00F3n"

        url = "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
        page = self.lxmlize(url)

        member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
        for member_node in member_nodes:
            member_info = member_node.text_content().strip().split("\n")

            # First line is the name (with an "Hon." prefix), last line the
            # district description.
            name = re.sub(r"^Hon\.", "", member_info[0]).strip()
            district_text = member_info[-1].strip()
            if district_text == at_large_label:
                district = "At-Large"
            else:
                district = district_text.replace("Representante del Distrito ",
                                                 "").strip()
            photo_url = self.get_node(member_node, ".//img/@src")

            rep_link = self.get_node(member_node, ".//a/@href")
            rep_page = self.lxmlize(rep_link)

            party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
            # Albelo doesn't seem to have a "partyBio" as an independent, but we
            # expect this to exist for all other members.
            # Compare against None: an lxml element without children is falsy,
            # so truth-testing would misreport a present node as missing.
            if party_node is None and name == "Manuel A. Natal Albelo":
                party = "Independent"
            else:
                party_text = party_node.text_content().strip()
                party = party_map[party_text]

            address = None
            address_node = self.get_node(rep_page, "//h6")
            if address_node is not None and address_node.text:
                address = address_node.text.strip().split("\n")[0].strip()

            # Only grabs the first validated phone number found.
            # Typically, representatives have multiple phone numbers.
            phone = None
            phone_node = self.get_node(
                rep_page,
                '//span[@class="data-type" and contains(text(), "Tel.")]')
            if phone_node is not None and phone_node.text:
                for phone_attempt in phone_node.text.strip().split("\n"):
                    phone_text = re.sub(r"^Tel\.[\s]*", "",
                                        phone_attempt).strip()
                    if validate_phone_number(phone_text):
                        phone = phone_text
                        break

            fax = None
            fax_node = self.get_node(
                rep_page,
                '//span[@class="data-type" and contains(text(), "Fax.")]')
            # "is not None" rather than truthiness: a leaf <span> is falsy.
            if fax_node is not None and fax_node.text:
                fax_text = re.sub(r"^Fax\.[\s]*", "",
                                  fax_node.text.strip()).strip()
                if validate_phone_number(fax_text):
                    fax = fax_text

            person = Person(
                primary_org="lower",
                district=district,
                name=name,
                party=party,
                image=photo_url,
            )

            person.add_link(rep_link)
            person.add_source(rep_link)
            person.add_source(url)

            if address:
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Capitol Office")
            if phone:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")
            if fax:
                person.add_contact_detail(type="fax",
                                          value=fax,
                                          note="Capitol Office")

            yield person
Example #8
0
    def _parse_office(self, office_node):
        """
        Extracts one office's contact details from its markup node.

        Returns a dict with ``name``, ``type``, ``phone``, ``fax`` and
        ``address`` keys, or ``None`` when the office is neither the Albany
        (capitol) office nor a district office. ``address`` is only filled in
        when street, city, state and postal code are all present; ``fax`` is
        dropped when it fails phone-number validation.
        """

        def stripped_text(xpath):
            # Text lookups come back as None when nothing matches.
            found = self.get_node(office_node, xpath)
            return None if found is None else found.strip()

        label = stripped_text('.//span[@itemprop="name"]/text()')
        if label is None:
            # Without a name the office cannot be classified.
            return None

        # Map the site's office names onto Open States' internal naming.
        if "Albany Office" in label:
            office_name, office_type = "Capitol Office", "capitol"
        elif "District Office" in label:
            office_name, office_type = "District Office", "district"
        else:
            return None

        street = stripped_text(
            './/div[@class="street-address"][1]/'
            'span[@itemprop="streetAddress"][1]/text()'
        )

        city = stripped_text('.//span[@class="locality"][1]/text()')
        if city is not None:
            # The locality is rendered with a trailing comma.
            city = city.rstrip(",")

        state = stripped_text('.//span[@class="region"][1]/text()')
        postal = stripped_text('.//span[@class="postal-code"][1]/text()')

        parts = (street, city, state, postal)
        if all(part is not None for part in parts):
            address = "{}\n{}, {} {}".format(*parts)
        else:
            address = None

        phone_span = self.get_node(
            office_node, './/div[@class="tel"]/span[@itemprop="telephone"]')
        phone = None if phone_span is None else phone_span.text.strip()

        fax = None
        fax_span = self.get_node(
            office_node, './/div[@class="tel"]/span[@itemprop="faxNumber"]')
        if fax_span is not None:
            candidate = fax_span.text.strip()
            if validate_phone_number(candidate):
                fax = candidate

        return {
            "name": office_name,
            "type": office_type,
            "phone": phone,
            "fax": fax,
            "address": address,
        }
Example #9
0
    def scrape_offices(self, legislator, doc):
        """Attach capitol and district contact details to ``legislator``.

        The contact block is located through its bold "Capitol Office:" label
        and read as a flat sequence of text chunks: a chunk ending in a colon
        starts a new section, and following chunks are that section's values.
        Reading stops at the "Biography" link or after the first
        "Business Phone:" value.

        The district phone is the home phone when one is listed, otherwise
        the business phone. Values of ``""`` or ``"NA"`` count as not listed.

        Args:
            legislator: object exposing ``add_contact_detail``.
            doc: parsed detail page of the legislator.
        """
        placeholders = ("", "NA")

        # Retrieve element that should contain all contact information for the
        # legislator and turn its text into a list of non-empty chunks.
        label = doc.xpath('//b[contains(., "Capitol Office:")]')[0]
        stripped = (t.strip() for t in label.getparent().itertext())
        chunks = [chunk for chunk in stripped if chunk]

        # Group values under their section header.
        officedata = defaultdict(list)
        current = None
        current_key = None
        for chunk in chunks:
            # Skip parsing biography link.
            if chunk.lower() == "biography":
                break
            # Contact snippets should be elements with headers that end in
            # colons.
            if chunk.endswith(":"):
                current_key = chunk
                current = officedata[current_key]
            elif current is not None:
                current.append(chunk)
                if current_key == "Business Phone:":
                    break

        def first_listed(*headers):
            """Return the first real value under ``headers``, else None."""
            for header in headers:
                values = officedata.get(header)
                if values and values[0] not in placeholders:
                    return values[0]
            return None

        # The second mailto link on the page is used as the e-mail address;
        # tolerate pages that have fewer links.
        mailto_links = doc.xpath('//a[contains(@href, "mailto:")]/@href')
        email = mailto_links[1][len("mailto:"):] if len(mailto_links) > 1 else None

        capitol_phone = first_listed("Capitol Phone:")
        capitol_address = "\n".join(officedata.get("Capitol Office:", [])) or None

        if email:
            legislator.add_contact_detail(type="email",
                                          value=email,
                                          note="Capitol Office")

        if capitol_phone and validate_phone_number(capitol_phone):
            legislator.add_contact_detail(type="voice",
                                          value=capitol_phone,
                                          note="Capitol Office")

        if capitol_address:
            legislator.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")

        # If a business or home phone is listed, attempt to use the
        # home phone first, then fall back on the business phone for
        # the district office number. A missing "Home Phone:" section no
        # longer prevents the fallback.
        district_phone = first_listed("Home Phone:", "Business Phone:")
        district_address = "\n".join(officedata.get("Home:", [])) or None

        # Add district office entry only if data exists for it.
        if district_phone and validate_phone_number(district_phone):
            legislator.add_contact_detail(type="voice",
                                          value=district_phone,
                                          note="District Office")

        if district_address:
            legislator.add_contact_detail(type="address",
                                          value=district_address,
                                          note="District Office")