def test_full_organization():
    """Round-trip a scraped organization through the importer and verify
    that every attribute and related record lands in the database intact."""
    create_jurisdictions()

    # build the scrape-side object with one of each kind of sub-record
    org = ScrapeOrganization("United Nations", classification="international")
    org.add_identifier("un")
    org.add_name("UN", start_date="1945")
    org.add_contact_detail(type="phone", value="555-555-1234", note="this is fake")
    org.add_link("http://example.com/link")
    org.add_source("http://example.com/source")

    # import org
    OrganizationImporter("jid1").import_data([org.as_dict()])

    # fetch the imported row from the db and assert it imported correctly
    imported = Organization.objects.get()
    assert "ocd-organization" in imported.id
    assert imported.name == org.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == "un"
    assert identifier.scheme == ""

    other_name = imported.other_names.all()[0]
    assert other_name.name == "UN"
    assert other_name.start_date == "1945"

    contact = imported.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert imported.links.all()[0].url == "http://example.com/link"
    assert imported.sources.all()[0].url == "http://example.com/source"
# ---------------------------------------------------------------------------
# Example #2
# ---------------------------------------------------------------------------
    def scrape(self):
        """Scrape DC Council committee pages and yield an Organization per
        committee.  Committees with no members are logged and skipped.
        """
        com_url = "http://dccouncil.us/committees"
        data = self.get(com_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(com_url)

        # set() dedupes repeated anchors pointing at the same committee page
        comms = set(doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

        # FIX: the loop variable was previously also named `committee` and was
        # shadowed mid-loop by the Organization object; use a distinct name for
        # the anchor element so each binding means one thing.
        for comm_link in comms:
            url = comm_link.attrib["href"]
            name = comm_link.text_content().strip()
            comm_data = self.get(url).text
            comm_page = lxml.html.fromstring(comm_data)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(
                name=name, classification="committee", chamber="legislature"
            )

            # evaluate the summary xpath once instead of twice
            summaries = comm_page.xpath('//p[@class="page-summary"]')
            if summaries:
                committee.extras["summary"] = summaries[0].text_content().strip()

            chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
            chair_name = chair[0].text_content().strip()
            chair_name = self.remove_title(chair_name)
            committee.add_member(chair_name, role="chair")

            members = comm_page.xpath(
                "//h4[text()='Councilmembers']/following-sibling::ul"
            )
            members = members[0].xpath("./li")

            for m in members:
                mem_name = m.text_content().strip()
                mem_name = self.remove_title(mem_name)
                # the chair is listed again among councilmembers; skip the dup
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)
            committee.add_link(url, note="Official Website")

            if not committee._related:
                self.warning("empty committee: %s;", name)
            else:
                yield committee
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        html = self.get(link).text
        page = lxml.html.fromstring(html)
        page.make_links_absolute(link)

        # Subcommittee pages carry a breadcrumb link back to "Committee".
        if page.xpath('//li/a[text()="Committee"]'):
            # All TN subcommittees are just the name of the parent committee
            # with " Subcommittee" at the end
            parent_name = re.sub(r"\s*(Study )?Subcommittee\s*", "", committee_name)
            com = Organization(
                committee_name,
                classification="committee",
                parent_id=self.parents[parent_name],
            )
        else:
            com = Organization(
                committee_name, chamber=chamber, classification="committee"
            )
            # remember this committee's id so its subcommittees can find it
            self.parents[committee_name] = com._id

        officer_xpath = (
            '//h2[contains(text(), "Committee Officers")]/'
            "following-sibling::div/ul/li/a"
        )
        member_xpath = (
            '//h2[contains(text(), "Committee Members")]/'
            "following-sibling::div/ul/li/a"
        )
        for anchor in page.xpath(officer_xpath) + page.xpath(member_xpath):
            pieces = anchor.xpath("text()") + anchor.xpath("span/text()")
            member_name = " ".join(p.strip() for p in pieces if p.strip())

            small = anchor.xpath("small")
            role = small[0].xpath("text()")[0].strip() if small else "member"
            if "(Vacant)" in role:
                continue

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
    def scrape(self, chamber=None):
        """Scrape Utah committees from the legislature's JSON feeds.

        Yields an Organization per committee.  Members whose legislator ID is
        missing from the legislator feed are logged and skipped.
        """
        committees_url = "http://le.utah.gov/data/committees.json"
        committees = self.get(committees_url).json()["committees"]

        people_url = "http://le.utah.gov/data/legislators.json"
        people = self.get(people_url).json()["legislators"]

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {person["id"]: person["formatName"] for person in people}

        for committee in committees:
            name = committee["description"]
            # strip the generic suffix; classification already says "committee"
            if name.endswith(" Committee"):
                name = name[: len(name) - len(" Committee")]
            elif name.endswith(" Subcommittee"):
                name = name[: len(name) - len(" Subcommittee")]
            # a chamber prefix in the name takes precedence over the argument
            if name.startswith("House "):
                name = name[len("House ") :]
                chamber = "lower"
            elif name.startswith("Senate "):
                name = name[len("Senate ") :]
                chamber = "upper"
            else:
                chamber = "legislature"

            c = Organization(chamber=chamber, name=name, classification="committee")
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee["link"])

            for member in committee["members"]:
                try:
                    member_name = ids_to_names[member["id"]]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " + member["id"]
                    )
                    # BUG FIX: previously execution fell through and called
                    # add_member with a stale (or never-bound) member_name,
                    # raising NameError on the first unknown ID or silently
                    # re-adding the previous member.  Skip this member instead.
                    continue
                c.add_member(member_name, role=member["position"])

            yield c
    def scrape_joint_committee(self, committee_name, url):
        """Build an Organization for a TN joint committee.

        Chooses a scraping strategy based on the URL's host; returns None
        when a state.tn.us committee page is unreachable.
        """
        if "state.tn.us" in url:
            com = Organization(
                committee_name, chamber="legislature", classification="committee"
            )
            try:
                html = self.get(url).text
            except requests.exceptions.ConnectionError:
                self.logger.warning("Committee link is broken, skipping")
                return

            doc = lxml.html.fromstring(html)

            # rows 2-9 of the roster table; the first cell holds the member link
            for cell in doc.xpath(
                "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            ):
                if cell.xpath("text()") == ["Vacant"]:
                    continue

                (member_name,) = cell.xpath("a/text()")
                trailing_text = cell.xpath("text()")
                role = trailing_text[0].strip(" ,") if trailing_text else "member"

                member_name = (
                    member_name.replace("Senator", "")
                    .replace("Representative", "")
                    .strip()
                )
                com.add_member(member_name, role)

            com.add_link(url)
            com.add_source(url)
            return com

        elif "gov-opps" in url:
            com = Organization(
                committee_name, chamber="legislature", classification="committee"
            )
            # fetch the landing page (members come from the per-chamber pages)
            page = self.get(url).text
            page = lxml.html.fromstring(page)

            # hoisted: these xpath strings are loop-invariant
            officer_xpath = (
                '//h2[contains(text(), "Committee Officers")]/'
                "following-sibling::div/ul/li/a"
            )
            member_xpath = (
                '//h2[contains(text(), "Committee Members")]/'
                "following-sibling::div/ul/li/a"
            )

            for chamber_name in ["senate", "house"]:
                chamber_link = (
                    self.base_href + "/" + chamber_name + "/committees/gov-opps.html"
                )
                chamber_doc = lxml.html.fromstring(self.get(chamber_link).text)

                anchors = chamber_doc.xpath(officer_xpath) + chamber_doc.xpath(
                    member_xpath
                )
                for anchor in anchors:
                    member_name = " ".join(
                        t.strip() for t in anchor.xpath(".//text()") if t.strip()
                    )
                    small = anchor.xpath("small")
                    if small:
                        role = small[0].xpath("text()")[0].strip()
                        # the role text is embedded in the anchor; remove it
                        member_name = member_name.replace(role, "").strip()
                    else:
                        role = "member"
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_link(url)
            com.add_source(url)
            return com

        else:
            return self._scrape_committee(committee_name, url, "legislature")