Example #1
0
    def handle_page(self):
        """Yield scrape jobs for each committee on the listing page.

        Parent committees set the context (parent id / chamber) used by the
        subcommittee rows that follow them in document order.
        """
        # don't use handle_page_item because we need to look back at prior element
        parent = None
        # Bug fix: previously `chamber` was only assigned inside the
        # "parentcommittee" branch, so a listing that did not start with a
        # parent row raised NameError on first use. Default to None.
        chamber = None

        for item in self.doc.xpath(self.list_xpath):
            cssclass = item.attrib.get("class", "")
            name = item.text_content().strip()

            if "parentcommittee" in cssclass:
                # A top-level committee: no parent, fixed lower chamber.
                parent = None
                chamber = "lower"

            comm = Organization(name=name,
                                classification="committee",
                                chamber=chamber,
                                parent_id=parent)
            yield self.scrape_page(HouseComDetail,
                                   item.attrib["href"],
                                   obj=comm)

            # parent for next time
            if "parentcommittee" in cssclass:
                parent = comm._id
                chamber = None
    def scrape_lower_committee(self, link, name):
        """Build and return a lower-chamber committee Organization
        by following *link* to the committee's detail page.
        """
        # Collapse any stray whitespace embedded in the href.
        url = re.sub(r"\s+", "", link.attrib["href"])
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        committee = Organization(name=name,
                                 chamber="lower",
                                 classification="committee")
        committee.add_source(url)

        # Every member is linked via a "?member=" profile URL.
        for member_link in page.xpath('//a[contains(@href, "?member=")]'):
            member_name = member_link.text_content().strip()
            member_name = re.sub(r"^Delegate\s+", "", member_name)
            # Role text follows the anchor; fall back to plain membership.
            member_role = member_link.getnext().text or "member"
            committee.add_member(member_name, member_role.strip())

        return committee
Example #3
0
    def scrape_committee(self, term, href, name):
        """Scrape one committee page at *href* and yield an Organization.

        Skips pages whose chamber cannot be determined from the URL.
        """
        doc = lxml.html.fromstring(self.get(href).text)
        doc.make_links_absolute(href)
        member_links = doc.xpath("//div[@class='view-content']"
                                 "//a[contains(@href, 'members')]")

        # The chamber is encoded in the URL path.
        if "/joint/" in href:
            chamber = "legislature"
        elif "/senate/" in href:
            chamber = "upper"
        elif "/house/" in href:
            chamber = "lower"
        else:
            # interim committees and others were causing duplicate committee issues, skipping
            self.warning(
                "Failed to identify chamber for {}; skipping".format(href))
            return

        committee = Organization(name, chamber=chamber,
                                 classification="committee")
        # Known section headings -> normalized roles; anything else fails
        # loudly (KeyError) so new headings get noticed.
        role_map = {
            "Legislative Members": "member",
            "Chairman": "chair",
            "Vice Chairman": "member",
        }
        for anchor in member_links:
            member = anchor.text
            heading = anchor.xpath(
                "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
            role = role_map[heading]

            if member is None or member.startswith("District"):
                continue

            member = member.replace("Senator ",
                                    "").replace("Representative ", "")

            committee.add_member(member, role=role)

        committee.add_source(href)
        yield committee
Example #4
0
    def scrape_committees(self, session):
        """Yield an Organization (with members) for every committee in
        *session*, using the OLIS API.

        Members whose legislator code cannot be resolved are kept with the
        raw code as their name.
        """
        session_key = SESSION_KEYS[session]
        committees_response = self.api_client.get("committees", session=session_key)

        # Map legislator codes -> display names for member resolution.
        legislators = index_legislators(self, session_key)

        for committee in committees_response:
            org = Organization(
                chamber={"S": "upper", "H": "lower", "J": "legislature"}[
                    committee["HouseOfAction"]
                ],
                name=committee["CommitteeName"],
                classification="committee",
            )
            org.add_source(
                "https://olis.leg.state.or.us/liz/{session}"
                "/Committees/{committee}/Overview".format(
                    session=session_key, committee=committee["CommitteeName"]
                )
            )
            members_response = self.api_client.get(
                "committee_members",
                session=session_key,
                committee=committee["CommitteeCode"],
            )
            for member in members_response:
                try:
                    member_name = legislators[member["LegislatorCode"]]
                except KeyError:
                    # logger.warn is a deprecated alias; use warning().
                    logger.warning(
                        "Legislator {} not found in session {}".format(
                            member["LegislatorCode"], session_key
                        )
                    )
                    # Fall back to the raw code so membership is preserved.
                    member_name = member["LegislatorCode"]
                org.add_member(
                    member_name, role=member["Title"] if member["Title"] else ""
                )

            yield org
    def _scrape_lower_standing_committee(self, committee_name, url):
        """Yield a lower-chamber standing committee scraped from *url*."""
        page = self.lxmlize(url)

        org = Organization(
            committee_name, chamber="lower", classification="committee"
        )
        org.add_source(url)

        row_xpath = (
            '//table[@id="body_ListView1_itemPlaceholderContainer"]'
            '/tr[@class="linkStyle2"]'
        )
        for row in page.xpath(row_xpath):
            # First cell carries the member link, second cell the role text.
            name = self._normalize_member_name(
                row.xpath("normalize-space(string(./td[1]/a))"))
            role = self._normalize_member_role(
                row.xpath("normalize-space(string(./td[2]))"))
            org.add_member(name, role)

        yield org
Example #6
0
    def scrape_committee(self, chamber, link, parent_comm=None):
        """Scrape the committee behind *link*, yielding it and (for
        top-level committees) any subcommittees found on its home page.

        Committees with no members are logged and skipped.
        """
        home_link = link.attrib["href"]
        # Strip the trailing "(H)"/"(S)" chamber tag from the link text.
        name = re.sub(r"\s+\((H|S)\)$", "", link.text).strip().title()
        name = name.replace(".", "").strip()
        if "Subcommittee " in name and parent_comm:
            # Reduce "X Subcommittee on Y" to just "Y".
            name = name.split("Subcommittee")[1]
            name = name.replace(" on ", "").replace(" On ", "")
            name = name.strip()
            comm = Organization(name,
                                parent_id=self.parents[parent_comm],
                                classification="committee")
        else:
            # Drop trailing committee-type words from the name.
            for c in ["Committee", "Comm", "Sub", "Subcommittee"]:
                if name.endswith(c):
                    name = name[:-1 * len(c)].strip()
            comm = Organization(name,
                                chamber=chamber,
                                classification="committee")
            self.parents[name] = comm._id
        comm.add_source(home_link)
        comm_url = home_link.replace("home.htm", "members.htm")
        self.scrape_members(comm, comm_url)

        if comm._related:
            yield comm
        else:
            self.logger.warning("Empty committee, skipping.")

        # deal with subcommittees
        if parent_comm is None:
            # checking parent_comm so we don't look for subcommittees
            # in subcommittees leaving us exposed to infinity
            page = self.get(home_link).text
            page = lxml.html.fromstring(page)
            page.make_links_absolute(home_link)
            sub_links = page.xpath("//li/a[contains(@href, '/home.htm')]")
            for sub_link in sub_links:
                # Bug fix: anchors with no text have .text == None, which
                # previously crashed on .lower(); guard before testing.
                if sub_link.text and "committee" in sub_link.text.lower():
                    yield from self.scrape_committee(chamber, sub_link, name)
    def scrape_upper_committee(self, name, url):
        """Yield an upper-chamber committee scraped from *url*.

        Raises:
            Exception: if no members are found, since that indicates a
                broken page or a stale xpath selector.
        """
        page = lxml.html.fromstring(self.get(url).text)

        comm = Organization(name=name, chamber="upper", classification="committee")
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'biographies')]"):
            member = link.xpath("string()").strip()
            member = re.sub(r"\s+", " ", member)
            if not member:
                continue
            # Role text trails the member link; default to plain member.
            role = link.tail
            if not role:
                role = "member"
            elif "Vice Chair" in role:
                role = "vice chair"
            elif "Chair" in role:
                role = "chair"
            member = member.replace("Senator ", "")
            comm.add_member(member, role=role)

        if not comm._related:
            # Bug fix: the message was previously passed logging-style as
            # Exception("no members for %s", comm.name) and never formatted.
            raise Exception("no members for %s" % comm.name)
        yield comm
Example #8
0
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    house = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act",
                      from_organization=house._id)

    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    # Two identical vote events referencing the same action.
    vote_kwargs = dict(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        organization=house._id,
    )
    ve1 = ScrapeVoteEvent(**vote_kwargs)
    ve2 = ScrapeVoteEvent(**vote_kwargs)
    # disambiguate them
    ve1.pupa_id = "one"
    ve2.pupa_id = "two"

    org_importer = OrganizationImporter("jid")
    org_importer.import_data([house.as_dict()])

    bill_importer = BillImporter("jid", org_importer, DumbMockImporter())
    bill_importer.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter("jid", DumbMockImporter(), org_importer,
                      bill_importer).import_data([ve1.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter("jid", DumbMockImporter(), org_importer,
                      bill_importer).import_data([ve1.as_dict(),
                                                  ve2.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
Example #9
0
def test_vote_event_bill_actions():
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    house = ScrapeOrganization(name="House", classification="lower")
    senate = ScrapeOrganization(name="Senate", classification="upper")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act",
                      from_organization=house._id)

    # add actions, passage of upper & lower on same day, something else,
    # then passage in upper again on a different day
    bill.add_action(description="passage", date="1900-04-01", chamber="upper")
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    bill.add_action(description="other event",
                    date="1900-04-01",
                    chamber="lower")
    bill.add_action(description="passage", date="1900-04-02", chamber="upper")

    # four passage votes, one per chamber, one on 04-01, and one on 04-02
    def passage_vote(start_date, org_id):
        return ScrapeVoteEvent(
            legislative_session="1900",
            motion_text="passage",
            start_date=start_date,
            classification="passage:bill",
            result="pass",
            bill_chamber="lower",
            bill="HB 1",
            bill_action="passage",
            organization=org_id,
        )

    ve1 = passage_vote("1900-04-01", house._id)
    ve2 = passage_vote("1900-04-01", senate._id)
    ve3 = passage_vote("1900-04-02", house._id)
    ve4 = passage_vote("1900-04-02", senate._id)

    org_importer = OrganizationImporter("jid")
    org_importer.import_data([house.as_dict(), senate.as_dict()])

    bill_importer = BillImporter("jid", org_importer, DumbMockImporter())
    bill_importer.import_data([bill.as_dict()])

    VoteEventImporter("jid", DumbMockImporter(), org_importer,
                      bill_importer).import_data(
        [ve1.as_dict(),
         ve2.as_dict(),
         ve3.as_dict(),
         ve4.as_dict()])

    bill = Bill.objects.get()
    votes = list(VoteEvent.objects.all())
    actions = list(bill.actions.all())
    assert len(actions) == 4
    assert len(votes) == 4

    votes = {(v.organization.classification, v.start_date): v.bill_action
             for v in votes}

    # ensure that votes are matched using action, chamber, and date
    assert votes[("upper", "1900-04-01")] == actions[0]
    assert votes[("lower", "1900-04-01")] == actions[1]
    assert votes[("upper", "1900-04-02")] == actions[3]
    assert votes[("lower", "1900-04-02")] is None
Example #10
0
    def scrape_chamber(self, chamber):
        """Scrape all committees for *chamber* ("upper" or "lower") from
        the Texas capitol site and yield Organization objects with their
        memberships attached.
        """
        committee_list_urls = {
            "lower":
            "https://capitol.texas.gov/Committees/"
            "CommitteesMbrs.aspx?Chamber=H",
            "upper":
            "https://capitol.texas.gov/Committees/"
            "CommitteesMbrs.aspx?Chamber=S",
        }

        committee_list_url = committee_list_urls[chamber]
        committee_list_page = self.lxmlize(committee_list_url)

        committee_nodes = self.get_nodes(
            committee_list_page, '//form[@id="ctl00"]//a[@id="CmteList"]')

        for committee_node in committee_nodes:
            committee_name = committee_node.text.strip()
            committee = Organization(name=committee_name,
                                     chamber=chamber,
                                     classification="committee")

            # Get the committee profile page.
            committee_page_url = committee_node.get("href")
            committee_page = self.lxmlize(committee_page_url)

            # Capture table with committee membership data.
            details_table = self.get_node(committee_page,
                                          '//div[@id="content"]//table[2]')
            if details_table is not None:
                # Skip the first row because it currently contains only headers
                detail_rows = self.get_nodes(details_table, "./tr")[1:]
                for detail_row in detail_rows:
                    label_text = self.get_node(detail_row, "./td[1]//text()")

                    if label_text:
                        label_text = label_text.strip().rstrip(":")

                    # NOTE(review): both "Chair" and "Vice Chair" are
                    # recorded as role "chair" here -- confirm intentional.
                    if label_text in ("Chair", "Vice Chair"):
                        member_role = "chair"
                    else:
                        member_role = "member"

                    member_name_text = self.get_node(detail_row,
                                                     "./td[2]/a/text()")

                    # Clean titles from member names.
                    if chamber == "upper":
                        member_name = re.sub(r"^Sen\.[\s]*", "",
                                             member_name_text)
                    elif chamber == "lower":
                        member_name = re.sub(r"^Rep\.[\s]*", "",
                                             member_name_text)

                    # Collapse multiple whitespaces in member names.
                    member_name = re.sub(r"[\s]{2,}", " ", member_name).strip()

                    committee.add_member(member_name, member_role)

            committee.add_source(committee_list_url)
            committee.add_source(committee_page_url)

            yield committee
 def get_organizations(self):
     """Yield the top-level legislature and its two chambers."""
     parent = Organization("Congress", classification="legislature")
     yield parent
     # NOTE(review): parent_id is handed the Organization object itself;
     # pupa examples typically pass parent._id -- confirm the importer
     # resolves a full object here.
     yield Organization("House", classification="lower", parent_id=parent)
     yield Organization("Senate", classification="upper", parent_id=parent)
Example #12
0
    def scrape_upper(self):
        """Scrape all CA Senate committees (standing, sub, joint, other)
        and yield Organization objects with their memberships.
        """
        # Retrieve index list of committees.
        url = "http://senate.ca.gov/committees"
        doc = self.lxmlize(url)

        standing_committees = doc.xpath(
            '//h2[text()="Standing Committees"]/../following-sibling::div//a')
        sub_committees = doc.xpath(
            '//h2[text()="Sub Committees"]/../following-sibling::div//a')
        joint_committees = doc.xpath(
            '//h2[text()="Joint Committees"]/../following-sibling::div//a')
        other_committees = doc.xpath(
            '//h2[text()="Other"]/../following-sibling::div//a')

        # Iterates over each committee [link] found.
        for committee in (standing_committees + sub_committees +
                          joint_committees + other_committees):
            # Get the text of the committee link, which should be the name of
            # the committee.
            (comm_name, ) = committee.xpath("text()")

            (comm_url, ) = committee.xpath("@href")
            comm_doc = self.lxmlize(comm_url)

            # Joint committees span both chambers; subcommittees are keyed
            # to their parent via a name/classification pseudo-id.
            if comm_name.startswith("Joint"):
                org = Organization(chamber="legislature",
                                   classification="committee",
                                   name=comm_name)
            elif comm_name.startswith("Subcommittee"):
                (parent_name,
                 ) = comm_doc.xpath('//div[@class="banner-sitename"]/a/text()')
                (subcom_name, ) = comm_doc.xpath('//h1[@class="title"]/text()')
                org = Organization(
                    name=subcom_name.strip(),
                    classification="committee",
                    parent_id={
                        "name": parent_name,
                        "classification": "upper"
                    },
                )
            else:
                org = Organization(chamber="upper",
                                   name=comm_name,
                                   classification="committee")

            org.add_source(comm_url)

            # Special case of members list being presented in text blob.
            member_blob = comm_doc.xpath(
                'string(//div[contains(@class, "field-item") and '
                'starts-with(text(), "Senate Membership:")][1]/text()[1])')

            if member_blob:
                # Separate senate membership from assembly membership.
                # This should strip the header from assembly membership
                # string automatically.
                delimiter = "Assembly Membership:\n"
                senate_members, delimiter, assembly_members = member_blob.partition(
                    delimiter)

                # Strip header from senate membership string.
                senate_members = senate_members.replace(
                    "Senate Membership:\n", "")

                # Clean membership strings.
                senate_members = senate_members.strip()
                assembly_members = assembly_members.strip()

                # Parse membership strings into lists.
                senate_members = senate_members.split("\n")
                assembly_members = assembly_members.split("\n")

                members = senate_members + assembly_members
            # Typical membership list format.
            else:
                members = comm_doc.xpath(
                    '//a[(contains(@href, "/sd") or '
                    'contains(@href, "assembly.ca.gov/a")) and '
                    '(starts-with(text(), "Senator") or '
                    'starts-with(text(), "Assembly Member"))]/text()')

            for member in members:
                if not member.strip():
                    continue

                # Strip the legislator title, then optionally capture a
                # parenthesized role, ignoring a trailing (R)/(D) party tag.
                (mem_name, mem_role) = re.search(
                    r"""(?ux)
                        ^(?:Senator|Assembly\sMember)\s  # Legislator title
                        (.+?)  # Capture the senator's full name
                        (?:\s\((.{2,}?)\))?  # There may be role in parentheses
                        (?:\s\([RD]\))?  # There may be a party affiliation
                        \s*$
                        """,
                    member,
                ).groups()
                org.add_member(mem_name,
                               role=mem_role if mem_role else "member")

            if not org._related:
                self.warning(
                    "No members found for committee {}".format(comm_name))

            yield org
Example #13
0
    def scrape_chamber(self, chamber):
        """Scrape SC legislators for *chamber* ("upper" or "lower").

        Yields each Person, plus each committee Organization the first
        time it is encountered (deduplicated via ``seen_committees``).
        """
        if chamber == "lower":
            url = "http://www.scstatehouse.gov/member.php?chamber=H"
        else:
            url = "http://www.scstatehouse.gov/member.php?chamber=S"

        # name -> Organization, so each committee is only yielded once
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[@class="membername"]'):
            full_name = a.text
            leg_url = a.get("href")

            # Drop chamber titles from the displayed name.
            if full_name.startswith("Senator"):
                full_name = full_name.replace("Senator ", "")
            if full_name.startswith("Representative"):
                full_name = full_name.replace("Representative ", "")

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if "Resigned effective" in leg_html:
                self.info("Resigned")
                continue

            # The styled <p> holds "party", "District # - County - Map",
            # and one more chunk we ignore.
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            if "Republican" in party:
                party = "Republican"
            elif "Democrat" in party:
                party = "Democratic"

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath(
                    '//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ""
            person = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )

            # capitol office address
            try:
                capitol_address = lxml.etree.tostring(
                    leg_doc.xpath('//h2[text()="Columbia Address"]/../p[1]')
                    [0]).decode()
                if capitol_address:
                    capitol_address = parse_address(capitol_address)
                    person.add_contact_detail(type="address",
                                              value=capitol_address,
                                              note="Capitol Office")
            except IndexError:
                self.warning("no capitol address for {0}".format(full_name))

            # capitol office phone
            try:
                capitol_phone = (
                    leg_doc.xpath('//h2[text()="Columbia Address"]/../p[2]')
                    [0].text_content().strip())
                label, number = parse_phone(capitol_phone)
                if number:
                    person.add_contact_detail(type="voice",
                                              value=number,
                                              note="Capitol Office")
            except IndexError:
                self.warning("no capitol phone for {0}".format(full_name))

            # home address
            try:
                home_address = lxml.etree.tostring(
                    leg_doc.xpath('//h2[text()="Home Address"]/../p[1]')
                    [0]).decode()
                if home_address:
                    home_address = parse_address(home_address)
                    person.add_contact_detail(type="address",
                                              value=home_address,
                                              note="District Office")
            except IndexError:
                self.warning("no home address for {0}".format(full_name))

            # home or business phone
            try:
                home_phone = (
                    leg_doc.xpath('//h2[text()="Home Address"]/../p[2]')
                    [0].text_content().strip())
                label, number = parse_phone(home_phone)
                if number:
                    label = ("Primary Office"
                             if label == "Business" else "District Office")
                    person.add_contact_detail(type="voice",
                                              value=number,
                                              note=label)
            except IndexError:
                self.warning(
                    "no home or business phone for {0}".format(full_name))

            # business or home phone
            try:
                business_phone = (
                    leg_doc.xpath('//h2[text()="Home Address"]/../p[3]')
                    [0].text_content().strip())
                label, number = parse_phone(business_phone)
                if number:
                    label = ("Primary Office"
                             if label == "Business" else "District Office")
                    person.add_contact_detail(type="voice",
                                              value=number,
                                              note=label)
            except IndexError:
                pass

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                # Link text like "Judiciary, V.C." carries a role suffix.
                if com.text.endswith(", "):
                    committee, role = com.text_content().rsplit(", ", 1)

                    # known roles
                    role = {
                        "Treas.": "treasurer",
                        "Secy.": "secretary",
                        "Secy./Treas.": "secretary/treasurer",
                        "V.C.": "vice-chair",
                        "1st V.C.": "first vice-chair",
                        "Co 1st V.C.": "co-first vice-chair",
                        "2nd V.C.": "second vice-chair",
                        "3rd V.C.": "third vice-chair",
                        "Ex.Officio Member": "ex-officio member",
                        "Chairman": "chairman",
                    }[role]
                else:
                    committee = com.text
                    role = "member"

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
    def _scrape_lower_chamber(self, session):
        """Scrape lower-chamber committees (joint committees listed there
        are re-classified as "legislature") and yield Organization objects
        with their memberships.
        """
        self.info("Scraping lower chamber for committees.")

        chamber = "lower"

        url = "{base}CommitteeHierarchy.aspx".format(base=self._reps_url_base)
        page_string = self.get(url).text
        page = lxml.html.fromstring(page_string)
        # Last tr has the date
        committee_links = page.xpath("//li//a")
        for committee_link in committee_links:
            committee_name = committee_link.text_content().strip()
            committee_url = committee_link.attrib.get("href")

            committee_url = "{base}{members}{url}".format(
                base=self._reps_url_base,
                members=
                "MemberGridCluster.aspx?filter=compage&category=committee&",
                url=committee_url,
            )
            # Joint committees appear on the House page but belong to the
            # whole legislature.
            actual_chamber = chamber
            if "joint" in committee_name.lower():
                actual_chamber = "legislature"

            # Normalize the name by dropping qualifier words. str.replace
            # removes every occurrence, so each word needs only one call
            # (a redundant duplicate replace("Special", ...) was removed).
            committee_name = committee_name.replace("Committee On ", "")
            committee_name = committee_name.replace("Special", "")
            committee_name = committee_name.replace("Select", "")
            committee_name = committee_name.replace("Joint", "")
            committee_name = committee_name.replace(" Committee", "")
            committee_name = committee_name.strip()

            committee = Organization(committee_name,
                                     chamber=actual_chamber,
                                     classification="committee")

            committee_page_string = self.get(committee_url).text
            committee_page = lxml.html.fromstring(committee_page_string)
            # First tr has the title (sigh)
            mem_trs = committee_page.xpath(
                "//table[@id='gvMembers_DXMainTable']//tr[contains(@class, 'dxgvDataRow')]"
            )
            for mem_tr in mem_trs:
                mem_code = None
                mem_links = mem_tr.xpath("td/a[1]")

                mem_role_string = mem_tr.xpath(
                    "td[4]")[0].text_content().strip()

                if len(mem_links):
                    mem_code = mem_links[0].attrib.get("href")
                # Output is "Rubble, Barney, Neighbor"

                mem_parts = mem_tr.xpath(
                    "td[2]")[0].text_content().strip().split(",")
                if self._no_members_text in mem_parts:
                    continue
                mem_name = mem_parts[1].strip() + " " + mem_parts[0].strip()
                # Sometimes Senator abbreviation is in the name
                mem_name = mem_name.replace("Sen. ", "")
                mem_name = mem_name.replace("Rep. ", "")

                mem_role = "member"

                # A non-trivial fourth-column string overrides the default.
                if len(mem_role_string) > 2:
                    mem_role = mem_role_string.lower()

                membership = committee.add_member(mem_name, role=mem_role)
                membership.extras = {"code": mem_code}

            committee.add_source(url)
            committee.add_source(committee_url)

            yield committee
Example #15
0
    def scrape(self):
        """Scrape Indiana committees from the state API.

        Fetches every committee page for the latest session, resolves its
        chamber, links subcommittees to their previously-seen parent
        committees, and yields one Organization per committee (skipping
        withdrawn and conference committees).
        """
        session = self.latest_session()

        # Mapping of subcommittee name -> owning committee name.
        subcomms = self.get_subcommittee_info(session)

        api_base_url = "https://api.iga.in.gov"
        html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(
            session)
        client = ApiClient(self)
        r = client.get("committees", session=session)
        all_pages = client.unpaginate(r)
        for comm_info in all_pages:
            # this is kind of roundabout, but needed in order
            # to take advantage of all of our machinery to make
            # sure we're not overloading their api
            comm_link = comm_info["link"]
            comm_name = comm_link.split("/")[-1]
            if "withdrawn" in comm_name or "conference" in comm_name:
                continue
            try:
                comm_json = client.get("committee",
                                       committee_link=comm_link[1:])
            except HTTPError:
                self.logger.warning("Page does not exist")
                continue
            try:
                chamber = comm_json["chamber"]["name"]
            except KeyError:
                chamber = "joint"
            else:
                if chamber == "Senate":
                    chamber = "upper"
                elif chamber == "House":
                    chamber = "lower"
                else:
                    raise AssertionError(
                        "Unknown committee chamber {}".format(chamber))

            name = comm_json["name"]
            try:
                owning_comm = subcomms[name]
            except KeyError:
                # Top-level committee.
                name = name.replace("Statutory Committee on", "").strip()
                comm = Organization(name=name,
                                    chamber=chamber,
                                    classification="committee")
                if name in subcomms.values():
                    # Avoid identification issues, if committee names are re-used
                    # between upper and lower chambers
                    assert self._parent_committees.get(name) is None
                    self._parent_committees[name] = comm
            else:
                # Subcommittee: attach to the parent committee seen earlier.
                name = (name.replace("Statutory Committee on",
                                     "").replace("Subcommittee", "").strip())
                comm = Organization(
                    name=name,
                    parent_id=self._parent_committees[owning_comm],
                    classification="committee",
                )

            chair = self.process_special_members(comm, comm_json, "chair")
            vicechair = self.process_special_members(comm, comm_json,
                                                     "viceChair")
            ranking = self.process_special_members(comm, comm_json,
                                                   "rankingMinMember")

            # leadership is also listed in membership
            # so we have to make sure we haven't seen them yet
            comm_members = [m for m in [chair, vicechair, ranking] if m]

            for mem in comm_json["members"]:
                mem_name = mem["firstName"] + " " + mem["lastName"]
                if mem_name not in comm_members:
                    comm_members.append(mem_name)
                    comm.add_member(mem_name)

            api_source = api_base_url + comm_link

            # BUG FIX: html_source was previously only assigned inside this
            # condition but added unconditionally below, which raised
            # NameError on the first non-matching name and silently reused
            # the prior committee's URL afterwards. Only add the HTML
            # source when this committee actually has one.
            if comm_name.startswith("committee_"):
                html_source = html_base_url + comm_name[len("committee_"):]
                comm.add_source(html_source)

            comm.add_source(api_source)
            yield comm
Example #16
0
    def scrape_comm(self, url, chamber):
        """Scrape committees for one chamber from the JSON endpoint at *url*.

        Yields one committee Organization per entry, attaching the
        vice-chair and chair roles before the remaining membership.
        """
        payload = self.post(url).json()["Data"]

        for entry in payload:
            name = entry["CommitteeName"]
            committee = Organization(name=name,
                                     chamber=chamber,
                                     classification="committee")

            # The API reports missing officers as null; str() renders that
            # as the literal "None", which must not be added as a member.
            chair_man = str(entry["ChairName"])
            vice_chair = str(entry["ViceChairName"])

            comm_url = self.get_comm_url(chamber, entry["CommitteeId"], name)
            roster = self.scrape_member_info(comm_url)

            if vice_chair != "None":
                committee.add_member(vice_chair, role="Vice-Chair")
            if chair_man != "None":
                committee.add_member(chair_man, role="Chairman")

            for person in roster:
                # The chair and vice-chair were already added with roles.
                if chair_man not in person and vice_chair not in person:
                    person = " ".join(person.split())
                    if person:
                        committee.add_member(person)

            committee.add_source(comm_url)
            committee.add_source(url)
            yield committee
Example #17
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse a committee-roster PDF and yield committee Organizations.

        Converts the PDF to text, normalizes known line-wrapping artifacts,
        then walks the lines accumulating members under the most recent
        committee heading. Subcommittees are currently skipped.
        """
        if chamber == "lower" and year == "2015":
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type="text-nolayout").decode()

        # PDF conversion wraps/respaces some committee names; normalize the
        # known offenders so is_committee_name() can recognize them.
        for hotgarbage, replacement in (
            (
                r"Judicial Branch, Law Enforcement,\s+and\s+Justice",
                "Judicial Branch, Law Enforcement, and Justice",
            ),
            (
                r"Natural Resources and\s+Transportation",
                "Natural Resources and Transportation",
            ),
            (
                r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications",
                "Federal Relations, Energy, and Telecommunications",
            ),
        ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: "Agriculture" not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents
            line = line.replace(" ", " ").replace("‐", "-")

            if "Subcommittees" in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee,
                                    chamber=chamber,
                                    classification="committee")

                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit("(", 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if re.search(" Ch", party):
                    role = "chair"
                elif " VCh" in party:
                    role = "vice chair"
                elif " MVCh" in party:
                    role = "minority vice chair"
                else:
                    role = "member"
                comm.add_member(name, role)

        # BUG FIX: guard against comm being None; previously this raised
        # AttributeError when no committee heading was ever found (matching
        # the in-loop check above).
        if comm and comm._related:
            yield comm
def test_full_bill():
    """End-to-end round trip: scrape a bill carrying every kind of related
    datum (actions, sponsors, abstracts, versions, documents, related
    bills), import it, and verify each piece survived in the database."""
    create_jurisdiction()
    sp = ScrapePerson("Adam Smith")
    org = ScrapeOrganization(name="House", classification="lower")
    com = ScrapeOrganization(
        name="Arbitrary Committee", classification="committee", parent_id=org._id
    )

    # Prior-session bill that the related-bill link below should resolve to.
    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        from_organization=org._id,
    )

    bill = ScrapeBill(
        "HB 1",
        "1900",
        "Axe & Tack Tax Act",
        classification="tax bill",
        from_organization=org._id,
    )
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee", "1900-04-04", chamber="lower")
    act.add_related_entity("arbitrary committee", "organization", com._id)
    bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )
    # One sponsorship with a resolvable entity_id, one name-only.
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=sp._id,
    )
    bill.add_sponsorship(
        "Jane Smith", classification="lead sponsor", entity_type="person", primary=True
    )
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    bill.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    bill.add_source("http://example.com/source")

    # import bill
    oi = OrganizationImporter("jid")
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter("jid")
    pi.import_data([sp.as_dict()])

    BillImporter("jid", oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get().organization == Organization.objects.get(
        classification="committee"
    )

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            # The primary sponsor was name-only, so nothing resolved.
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
 def __missing__(self, key):
     """On first lookup of *key*, build a joint committee Organization,
     cache it under the key, and return it (defaultdict-style)."""
     org = Organization(chamber="legislature",
                        name=key,
                        classification="committee")
     self[key] = org
     return org
Example #20
0
def test_vote_event_bill_actions_errors():
    """Vote-to-action matching error cases: ambiguous actions, missing
    actions, and two votes competing for the same action."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    house = ScrapeOrganization(name="House", classification="lower")
    senate = ScrapeOrganization(name="Senate", classification="upper")
    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      from_organization=house._id)

    # for this bill, two identical actions, so vote matching will fail
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    bill.add_action(description="passage", date="1900-04-01", chamber="lower")
    # this action is good, but two votes will try to match it
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    def make_vote(identifier, start_date, bill_action,
                  motion_text="passage", result="pass"):
        # All four vote events share everything but these few fields.
        return ScrapeVoteEvent(
            legislative_session="1900",
            motion_text=motion_text,
            start_date=start_date,
            classification="passage:bill",
            result=result,
            bill_chamber="lower",
            bill="HB 1",
            identifier=identifier,
            bill_action=bill_action,
            organization=house._id,
        )

    # will match two actions
    ve1 = make_vote("1", "1900-04-01", "passage")
    # will match no actions
    ve2 = make_vote("2", "1900-04-01", "committee result")
    # these two votes will both match the same action
    ve3 = make_vote("3", "1900-04-02", "passage")
    ve4 = make_vote("4", "1900-04-02", "passage",
                    motion_text="passage-syz", result="fail")

    oi = OrganizationImporter("jid")
    oi.import_data([house.as_dict(), senate.as_dict()])
    bi = BillImporter("jid", oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data(
        [v.as_dict() for v in (ve1, ve2, ve3, ve4)])

    bill = Bill.objects.get()
    votes = list(VoteEvent.objects.all().order_by("identifier"))

    # isn't matched, was ambiguous across two actions
    assert votes[0].bill_action is None
    # isn't matched, no match in actions
    assert votes[1].bill_action is None

    # these both try to match the same action, only first will succeed
    assert votes[2].bill_action is not None
    assert votes[3].bill_action is None
    def scrape_current(self, chamber):
        """Scrape currently-active Kansas committees for *chamber* from the
        KS legislature API, yielding one Organization per non-empty
        committee with chairs, vice-chairs, ranking members, and members."""
        if chamber == "upper":
            com_types = ["special_committees", "senate_committees"]
        else:
            com_types = ["house_committees"]

        listing = json.loads(self.get(ksapi.url + "ctte/").text)

        for com_type in com_types:
            for committee_data in listing["content"][com_type]:

                # Special committees are joint bodies, so they belong to the
                # whole legislature rather than a single chamber.
                if com_type == "special_committees":
                    com_chamber = "legislature"
                else:
                    com_chamber = chamber

                committee = Organization(
                    committee_data["TITLE"],
                    chamber=com_chamber,
                    classification="committee",
                )

                com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)["content"]

                for chair in details["CHAIR"]:
                    chair_name = chair.get("FULLNAME", None)
                    if not chair_name:
                        # Some chair records lack a name; derive one from
                        # the KPID instead.
                        chair_name = self.parse_kpid(chair["KPID"])
                        self.warning("no FULLNAME for %s", chair["KPID"])
                    committee.add_member(chair_name, "chairman")

                for vicechair in details["VICECHAIR"]:
                    committee.add_member(vicechair["FULLNAME"],
                                         "vice-chairman")
                for rankedmember in details["RMMEM"]:
                    committee.add_member(rankedmember["FULLNAME"],
                                         "ranking member")
                for member in details["MEMBERS"]:
                    committee.add_member(member["FULLNAME"])

                if committee._related:
                    committee.add_source(com_url)
                    yield committee
                else:
                    self.warning("skipping blank committee %s" %
                                 committee_data["TITLE"])
def test_basic_invalid_organization():
    """An Organization built with only a name has no source attached,
    so validation must reject it."""
    org = Organization("name")

    with pytest.raises(ScrapeValueError):
        org.validate()
    def _scrape_upper_chamber(self, session):
        """Scrape upper-chamber (Senate) committees for *session*.

        Picks the listing URL and page layout based on whether the session
        predates the 2015 Senate site redesign, then yields one committee
        Organization per standing committee with members attached.
        """
        self.info("Scraping upper chamber for committees.")

        chamber = "upper"

        # The Senate site was redesigned in 2015; select the URL pattern
        # and the container div id that match this session's layout.
        if self._is_post_2015 and self.latest_session() != session:
            url = "{base}{year}web/standing-committees".format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = "primary"
        elif session == self.latest_session():
            url = "{base}standing-committees".format(
                base=self._senate_url_base)
            comm_container_id = "primary"
        else:
            url = "{base}{year}info/com-standing.htm".format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = "mainContent"

        page = self.lxmlize(url)

        comm_links = self.get_nodes(
            page, '//div[@id = "{}"]//p/a'.format(comm_container_id))

        for comm_link in comm_links:
            # Normalize to uppercase - varies between "Assigned bills" and "Assigned Bills"
            if "ASSIGNED BILLS" in comm_link.text_content().upper():
                continue

            comm_link = comm_link.attrib["href"]

            # Filter out links that aren't committee pages in this layout.
            if self._is_post_2015:
                if "web" not in comm_link:
                    continue
            else:
                if "comm" not in comm_link:
                    continue

            comm_page = self.lxmlize(comm_link)

            if self._is_post_2015:
                comm_name = self.get_node(comm_page,
                                          '//h1[@class="entry-title"]/text()')
                members = self.get_nodes(
                    comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a')
            else:
                comm_name = self.get_node(comm_page,
                                          '//div[@id="mainContent"]/p/text()')
                members = self.get_nodes(comm_page,
                                         '//div[@id="mainContent"]//td/a')

            comm_name = comm_name.replace(" Committee", "")
            comm_name = comm_name.strip()

            committee = Organization(comm_name,
                                     chamber=chamber,
                                     classification="committee")

            for member in members:
                mem_link = member.attrib.get("href", "")
                if "mem" not in mem_link:
                    continue

                if self._is_post_2015:
                    # NOTE(review): this lookup result is immediately
                    # overwritten by the text_content() split below — looks
                    # like dead code; confirm before removing.
                    mem_parts = self.get_node(
                        member, './/span[@class="bwg_title_spun2_0"]')

                mem_parts = member.text_content().strip().split(",")
                # Senator title stripping mainly for post-2015.
                mem_name = re.sub(r"^Senator[\s]+", "", mem_parts[0])

                # this one time, MO forgot the comma between
                # the member and his district. Very rarely relevant
                try:
                    int(mem_name[-4:-2]
                        )  # the district's # is in this position
                except ValueError:
                    pass
                else:
                    mem_name = " ".join(
                        mem_name.split(" ")[0:-1])  # member name fixed

                    # ok, so this next line. We don't care about
                    # the first 2 elements of mem_parts anymore
                    # so whatever. But if the member as a role, we want
                    # to make sure there are 3 elements in mem_parts and
                    # the last one is actually the role. This sucks, sorry.
                    mem_parts.append(mem_parts[-1])

                mem_role = "member"
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower().split("    ")[0].strip()

                if mem_name == "":
                    continue

                committee.add_member(mem_name, role=mem_role)

            committee.add_source(url)
            committee.add_source(comm_link)

            yield committee
def test_no_source_on_party_org():
    """Party organizations are exempt from the source requirement, so
    validate() must succeed even though no source was added."""
    party = Organization("Hat", classification="party")
    party.validate()
    def scrape(self):
        """Scrape Connecticut legislators (and their committees) from the
        CGA legislator-database CSV.

        Yields each committee Organization the first time it is seen, then
        each Person with contact details, links, and committee memberships.
        """
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {"H": "lower", "S": "upper"}[row["office code"]]

            district = row["dist"].lstrip("0")
            assert district.isdigit(), "Invalid district found: {}".format(
                district)

            # Assemble "First [M] Last [Suffix]".
            name = row["first name"]
            mid = row["middle initial"].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row["last name"]
            suffix = row["suffix"].strip()
            if suffix:
                name += " %s" % suffix

            party = row["party"]
            if party == "Democrat":
                party = "Democratic"

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party)

            legislator_url = row["URL"].replace("\\", "//").strip()
            if legislator_url != "":
                if not legislator_url.startswith("http"):
                    # BUG FIX: previously the URL was *replaced* by the bare
                    # scheme string ("http://"); prepend the scheme instead.
                    legislator_url = "http://" + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row["capitol street address"],
                row["room number"],
            )
            # extra_office_fields = dict()
            email = row["email"].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith("http://") or email.startswith(
                        "https://"):
                    # A URL in the email column is a contact form, not an
                    # email address; drop it.
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError(
                        "Problematic email found: {}".format(email))
            leg.add_contact_detail(type="address",
                                   value=office_address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=row["capitol phone"],
                                   note="Capitol Office")
            if email:
                leg.add_contact_detail(type="email", value=email)

            home_address = "{}\n{}, {} {}".format(
                row["home street address"],
                row["home city"],
                row["home state"],
                row["home zip code"],
            )
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type="address",
                                       value=home_address,
                                       note="District Office")
                if row["home phone"].strip():
                    leg.add_contact_detail(type="voice",
                                           value=row["home phone"],
                                           note="District Office")
            leg.add_source(leg_url)

            # Committee names look like "Name (role)"; default role is member.
            for comm_name in row["committee member1"].split(";"):
                if " (" in comm_name:
                    comm_name, role = comm_name.split(" (")
                    role = role.strip(")").lower()
                else:
                    role = "member"
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        # First sighting of this committee: create and yield.
                        com = Organization(comm_name,
                                           classification="committee",
                                           chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
Example #26
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        """Scrape one committee page, yielding its Organization and
        recursing into any subcommittees linked from the page.

        :param chamber: legislative chamber of the committee
        :param name: committee (or parent committee) name
        :param url: committee page URL
        :param subcommittee: subcommittee name when recursing, else None
        """
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            subcommittee = page.xpath(xpath)
            if subcommittee:
                subcommittee = page.xpath(xpath).pop(0)
                subcommittee = self._fix_committee_name(
                    subcommittee, parent=name, subcommittee=True
                )
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Organization(chamber=chamber, name=name, classification="committee")
        comm.add_source(url)

        member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

        for member_node in member_nodes:
            # Skip empty rows. Use .get() because some rows carry no class
            # attribute at all (previously raised KeyError).
            if member_node.attrib.get("class") == "dxgvEmptyDataRow":
                continue

            mtype = member_node.xpath("string(td[1])").strip()

            if not mtype:
                mtype = "member"

            member = member_node.xpath("string(td[3])").split()

            # Drop the leading title token (e.g. "Senator").
            member = " ".join(member[1:])

            comm.add_member(member, role=mtype)

        for a in page.xpath(
            '//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
            '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'
        ):
            sub_name = a.text.strip()
            sub_url = a.get("href").replace("../", "/")
            # BUG FIX: this method is a generator, so the bare recursive call
            # produced an unconsumed generator — subcommittees were never
            # scraped or yielded. Delegate with ``yield from``.
            yield from self.scrape_committee(chamber, name, sub_url,
                                             subcommittee=sub_name)

        if not comm._related:
            if subcommittee:
                self.warning("Not saving empty subcommittee {}.".format(subcommittee))
            else:
                self.warning("Not saving empty committee {}.".format(name))
        else:
            yield comm
Example #27
0
    def scrape_lower(self):
        """Scrape lower-chamber committees and subcommittees.

        Walks the Standing and Select committee listings, then the
        subcommittee section, yielding an Organization per committee that
        has at least one member. When a committee page lists no members,
        retries the "/membersstaff" variant of its URL before giving up.
        """
        url = self.urls["lower"]
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.base_urls["lower"])

        for type_ in ["Standing", "Select"]:

            # NOTE(review): type_ is only ever "Standing" or "Select", so
            # this branch never fires — presumably a leftover from when
            # "Joint" was in the list above; confirm before removing.
            if type_ == "Joint":
                _chamber = type_.lower()
            else:
                _chamber = "lower"

            # The site markup has varied over time; try each known
            # container xpath until one matches.
            for xpath in [
                    '//div[contains(@class, "view-view-%sCommittee")]' % type_,
                    '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
                    '//div[contains(@class, "views-field-title")]',
            ]:
                div = doc.xpath(xpath)
                if div:
                    break

            div = div[0]
            committees = div.xpath(
                'descendant::span[@class="field-content"]/a/text()')
            committees = map(strip, committees)
            urls = div.xpath(
                'descendant::span[@class="field-content"]/a/@href')

            for c, _url in zip(committees, urls):

                c = c.replace("Committee on ", "").replace(" Committee", "")
                org = Organization(name=c,
                                   chamber=_chamber,
                                   classification="committee")
                self.info(u"Saving {} committee.".format(c))
                org.add_source(_url)
                org.add_source(url)
                for member, role in self.scrape_lower_members(_url):
                    org.add_member(member, role)

                # Fallback: some committees list members only on the
                # "/membersstaff" page.
                _found = False
                if not org._related:
                    try:
                        for member, role in self.scrape_lower_members(
                                _url + "/membersstaff"):
                            _found = True
                            org.add_member(member, role)
                        if _found:
                            source = _url + "/membersstaff"
                            org.add_source(source)
                    except requests.exceptions.HTTPError:
                        self.error("Unable to access member list for {} "
                                   "committee.".format(c))

                if org._related:
                    yield org
                else:
                    self.warning(
                        "No members found for {} committee.".format(c))

        # Subcommittees
        div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
        for subcom in div.xpath('div/div[@class="item-list"]'):
            committee = self.get_node(subcom, "h4/text()")

            if committee is None:
                continue

            names = subcom.xpath("descendant::a/text()")
            names = map(strip, names)
            urls = subcom.xpath("descendant::a/@href")
            for n, _url in zip(names, urls):
                n = re.search(r"^Subcommittee.*?on (.*)$", n).group(1)
                # NOTE(review): unusual Organization kwargs here —
                # parent="lower" alongside a dict-valued parent_id; confirm
                # this matches the Organization constructor's contract.
                org = Organization(
                    name=n,
                    parent="lower",
                    classification="committee",
                    parent_id={
                        "name": committee,
                        "classification": "lower"
                    },
                )
                org.add_source(_url)
                org.add_source(url)

                for member, role in self.scrape_lower_members(_url):
                    org.add_member(member, role)

                # Same "/membersstaff" fallback as for full committees.
                _found = False
                if not org._related:
                    try:
                        for member, role in self.scrape_lower_members(
                                _url + "/membersstaff"):
                            _found = True
                            org.add_member(member, role)
                        if _found:
                            source = _url + "/membersstaff"
                            org.add_source(source)
                    except requests.exceptions.HTTPError:
                        self.error(
                            "Unable to access member list for {} subcommittee."
                            .format(org.name))

                if org._related:
                    yield org
                else:
                    # NOTE(review): the second format arg is org._related
                    # (the member list), not the parent committee name —
                    # looks like a leftover; confirm intent.
                    self.warning("No members found for {} subcommittee of {} "
                                 "committee".format(org.name, org._related))
    def scrape_joint_committee(self, committee_name, url):
        """Scrape a joint committee, dispatching on the URL's host/path.

        Three cases: legacy "state.tn.us" pages (table layout),
        "gov-opps" pages (officer/member lists per chamber), and
        everything else (delegated to _scrape_committee). Returns the
        Organization, or None when the page is unreachable.
        """
        if "state.tn.us" in url:
            com = Organization(committee_name,
                               chamber="legislature",
                               classification="committee")
            try:
                page = self.get(url).text
            except requests.exceptions.ConnectionError:
                self.logger.warning("Committee link is broken, skipping")
                return

            page = lxml.html.fromstring(page)

            # Rows 2-9 of the blurb table hold the member cells.
            for el in page.xpath(
                    "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            ):
                if el.xpath("text()") == ["Vacant"]:
                    continue

                # Exactly one <a> text node is expected per cell; the
                # tuple unpack raises if the layout changes.
                (member_name, ) = el.xpath("a/text()")
                # Any bare text in the cell (outside the link) is the role.
                if el.xpath("text()"):
                    role = el.xpath("text()")[0].strip(" ,")
                else:
                    role = "member"

                member_name = member_name.replace("Senator", "")
                member_name = member_name.replace("Representative", "")
                member_name = member_name.strip()
                com.add_member(member_name, role)

            com.add_link(url)
            com.add_source(url)
            return com

        elif "gov-opps" in url:
            com = Organization(committee_name,
                               chamber="legislature",
                               classification="committee")
            page = self.get(url).text
            page = lxml.html.fromstring(page)

            # The joint committee's roster is split across the per-chamber
            # pages; scrape both and merge into one Organization.
            links = ["senate", "house"]
            for link in links:
                chamber_link = self.base_href + "/" + link + "/committees/gov-opps.html"
                chamber_page = self.get(chamber_link).text
                chamber_page = lxml.html.fromstring(chamber_page)

                OFFICER_SEARCH = (
                    '//h2[contains(text(), "Committee Officers")]/'
                    "following-sibling::div/ul/li/a")
                MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                                 "following-sibling::div/ul/li/a")
                for a in chamber_page.xpath(
                        OFFICER_SEARCH) + chamber_page.xpath(MEMBER_SEARCH):
                    member_name = " ".join(
                        [x.strip() for x in a.xpath(".//text()") if x.strip()])
                    # A <small> child holds the officer's role; strip it
                    # back out of the combined text.
                    role = a.xpath("small")
                    if role:
                        role = role[0].xpath("text()")[0].strip()
                        member_name = member_name.replace(role, "").strip()
                    else:
                        role = "member"
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_link(url)
            com.add_source(url)
            return com

        else:
            # Generic committee page; shared scraper handles it.
            return self._scrape_committee(committee_name, url, "legislature")
 def get_organizations(self):
     """Yield the single legislative body of this unicameral jurisdiction."""
     legislature = Organization("Unicameral Legislature",
                                classification="legislature")
     yield legislature
def test_full_organization():
    """Round-trip an organization with identifiers, names, contact details,
    links, and sources through the importer and verify every field."""
    create_jurisdictions()
    scraped = ScrapeOrganization("United Nations", classification="international")
    scraped.add_identifier("un")
    scraped.add_name("UN", start_date="1945")
    scraped.add_contact_detail(type="phone",
                               value="555-555-1234",
                               note="this is fake")
    scraped.add_link("http://example.com/link")
    scraped.add_source("http://example.com/source")

    # import org
    OrganizationImporter("jid1").import_data([scraped.as_dict()])

    # Fetch the stored organization and confirm every field survived.
    stored = Organization.objects.get()
    assert "ocd-organization" in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == "un"
    assert identifier.scheme == ""

    other_name = stored.other_names.all()[0]
    assert other_name.name == "UN"
    assert other_name.start_date == "1945"

    contact = stored.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert stored.links.all()[0].url == "http://example.com/link"
    assert stored.sources.all()[0].url == "http://example.com/source"