Example No. 1
    def get_organizations(self):

        # Initialize the Organization class. Use keyword args to set the basic
        # properties.
        secretary_of_state = Organization(
            name="Office of the Secretary of State, State of Arizona",
            classification="office"
        )

        # Add contact details and a link to the official website.
        secretary_of_state.add_contact_detail(
            type="voice",
            value="602-542-4285"
        )

        secretary_of_state.add_contact_detail(
            type="address",
            value="1700 W Washington St Fl 7, Phoenix AZ 85007-2808"
        )
        secretary_of_state.add_link(
            url="http://www.azsos.gov/",
            note="Home page"
        )

        self._secretary_of_state = secretary_of_state

        yield secretary_of_state
Example No. 2
def test_full_organization():
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # import org
    od = org.as_dict()
    OrganizationImporter('jurisdiction-id').import_data([od])

    # get organization from db and assert it imported correctly
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    assert o.identifiers.all()[0].identifier == 'un'
    assert o.identifiers.all()[0].scheme == ''

    assert o.other_names.all()[0].name == 'UN'
    assert o.other_names.all()[0].start_date == '1945'

    assert o.contact_details.all()[0].type == 'phone'
    assert o.contact_details.all()[0].value == '555-555-1234'
    assert o.contact_details.all()[0].note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
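The round trip in this test (build a scrape-side ScrapeOrganization, serialize it with as_dict(), hand the dict to OrganizationImporter) is the same hand-off the scraper examples below feed into. A minimal sketch of just that flow, assuming the same imports as the test above; the organization name and jurisdiction id are hypothetical:

# Minimal scrape-to-import sketch; assumes the same imports as the test above.
# The organization name and jurisdiction id are placeholders.
org = ScrapeOrganization('Example Assembly', classification='legislature')
org.add_source('http://example.com/source')
OrganizationImporter('example-jurisdiction-id').import_data([org.as_dict()])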
Example No. 3
    def scrape_committees(self, repos):
        for repo in repos:
            source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(repo)
            committees = self.fetch_yaml(source)
            for committee in committees:
                org = Organization(committee["name"], classification="committee")

                org.add_source(source)

                for key in committee.keys() & {"url", "rss_url"}:
                    org.add_link(committee[key])

                for key in committee.keys() & {"phone", "address"}:
                    contact_type = "voice" if key == "phone" else key
                    org.add_contact_detail(type=contact_type, value=committee[key])

                for key in committee.keys() & {"senate_committee_id", "house_committee_id", "thomas_id"}:
                    org.add_identifier(committee[key], scheme=key)

                if "subcommittees" in committee:
                    for subcommittee in committee["subcommittees"]:
                        sub_org = Organization(subcommittee["name"], classification="committee", parent_id=org._id)

                        sub_org.add_identifier(subcommittee["thomas_id"], scheme="thomas")
                        sub_org.add_source(source)

                        for key in subcommittee.keys() & {"phone", "address"}:
                            contact_type = "voice" if key == "phone" else key
                            sub_org.add_contact_detail(type=contact_type, value=subcommittee[key])

                        yield sub_org

                yield org
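For reference, each committee entry returned by fetch_yaml only needs the keys this loop reads; a hypothetical entry with purely illustrative values would look like:

# Shape of one parsed committee entry, as implied by the keys read above.
# Every value here is an illustrative placeholder, not real data.
committee = {
    "name": "Committee on Example Affairs",
    "thomas_id": "HSEX",
    "url": "https://example.house.gov",
    "phone": "202-555-0100",
    "subcommittees": [
        {"name": "Subcommittee on Examples", "thomas_id": "01", "address": "100 Example Building"},
    ],
}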
Example No. 4
def test_full_organization():
    create_jurisdictions()
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone',
                           value='555-555-1234',
                           note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # import org
    od = org.as_dict()
    OrganizationImporter('jid1').import_data([od])

    # get organization from db and assert it imported correctly
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    assert o.identifiers.all()[0].identifier == 'un'
    assert o.identifiers.all()[0].scheme == ''

    assert o.other_names.all()[0].name == 'UN'
    assert o.other_names.all()[0].start_date == '1945'

    assert o.contact_details.all()[0].type == 'phone'
    assert o.contact_details.all()[0].value == '555-555-1234'
    assert o.contact_details.all()[0].note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
Example No. 5
    def scrape(self):
        com_url = "http://dccouncil.us/committees"
        data = self.get(com_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(com_url)

        comms = set(doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

        for committee in comms:
            url = committee.attrib["href"]
            name = committee.text_content().strip()
            comm_data = self.get(url).text
            comm_page = lxml.html.fromstring(comm_data)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(
                name=name, classification="committee", chamber="legislature"
            )

            if comm_page.xpath('//p[@class="page-summary"]'):
                summary = (
                    comm_page.xpath('//p[@class="page-summary"]')[0]
                    .text_content()
                    .strip()
                )
                committee.extras["summary"] = summary

            chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
            chair_name = chair[0].text_content().strip()
            chair_name = self.remove_title(chair_name)
            committee.add_member(chair_name, role="chair")

            members = comm_page.xpath(
                "//h4[text()='Councilmembers']/following-sibling::ul"
            )
            members = members[0].xpath("./li")

            for m in members:
                mem_name = m.text_content().strip()
                mem_name = self.remove_title(mem_name)
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)
            committee.add_link(url, note="Official Website")

            if not committee._related:
                self.warning("empty committee: %s;", name)
            else:
                yield committee
Example No. 6
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        page = self.get(link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(link)

        is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))
        if is_subcommittee:
            # All TN subcommittees are just the name of the parent committee with " Subcommittee"
            # at the end
            parent_committee_name = re.sub(r'\s*(Study )?Subcommittee\s*', '', committee_name)
            com = Organization(
                    committee_name,
                    classification='committee',
                    parent_id=self.parents[parent_committee_name]
                    )
        else:
            com = Organization(
                committee_name,
                chamber=chamber,
                classification='committee',
            )
            self.parents[committee_name] = com._id

        OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                         'following-sibling::div/ul/li/a'
        MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                        'following-sibling::div/ul/li/a'
        for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):

            member_name = ' '.join([
                    x.strip() for x in
                    a.xpath('text()') + a.xpath('span/text()')
                    if x.strip()
                    ])
            role = a.xpath('small')
            if role:
                role = role[0].xpath('text()')[0].strip()
            else:
                role = 'member'
            if '(Vacant)' in role:
                continue

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
Example No. 7
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        page = self.get(link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(link)

        is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))
        if is_subcommittee:
            # All TN subcommittees are just the name of the parent committee with " Subcommittee"
            # at the end
            parent_committee_name = re.sub(r'\s*Subcommittee\s*', '',
                                           committee_name)
            com = Organization(
                committee_name,
                classification='committee',
                parent_id={
                    'name': parent_committee_name,
                    'classification': chamber
                },
            )
        else:
            com = Organization(
                committee_name,
                chamber=chamber,
                classification='committee',
            )

        OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                         'following-sibling::div/ul/li/a'
        MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                        'following-sibling::div/ul/li/a'
        for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):

            member_name = ' '.join([
                x.strip() for x in a.xpath('text()') + a.xpath('span/text()')
                if x.strip()
            ])
            role = a.xpath('small')
            if role:
                role = role[0].xpath('text()')[0].strip()
            else:
                role = 'member'

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
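The two _scrape_committee variants above (Examples No. 6 and No. 7) show the two ways a subcommittee can reference its parent: directly by the parent object's _id, or by a name/classification lookup dict when only the parent's name is known. A minimal side-by-side sketch with hypothetical committee names:

# Two parent references, mirroring the variants above; names are hypothetical.
parent = Organization('Example Committee', chamber='upper', classification='committee')

by_id = Organization('Example Subcommittee', classification='committee',
                     parent_id=parent._id)
by_lookup = Organization('Example Subcommittee', classification='committee',
                         parent_id={'name': 'Example Committee', 'classification': 'upper'})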
Example No. 8
    def scrape(self):
        com_url = 'http://dccouncil.us/committees'
        data = self.get(com_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(com_url)

        comms = set(
            doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

        for committee in comms:
            url = committee.attrib['href']
            name = committee.text_content().strip()
            comm_data = self.get(url).text
            comm_page = lxml.html.fromstring(comm_data)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(name=name, classification='committee',
                                     chamber='legislature')

            if comm_page.xpath('//p[@class="page-summary"]'):
                summary = comm_page.xpath(
                    '//p[@class="page-summary"]')[0].text_content().strip()
                committee.extras['summary'] = summary

            chair = comm_page.xpath(
                "//h4[text()='Chairperson']/following-sibling::p")
            chair_name = chair[0].text_content().strip()
            chair_name = self.remove_title(chair_name)
            committee.add_member(chair_name, role="chair")

            members = comm_page.xpath(
                "//h4[text()='Councilmembers']/following-sibling::ul")
            members = members[0].xpath("./li")

            for m in members:
                mem_name = m.text_content().strip()
                mem_name = self.remove_title(mem_name)
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)
            committee.add_link(url, note='Official Website')

            if not committee._related:
                self.warning('empty committee: %s;', name)
            else:
                yield committee
Example No. 9
    def get_organizations(self):
        secretary_of_state = Organization(
            name="Office of the Secretary of State, State of California",
            classification="office")

        secretary_of_state.add_contact_detail(type="voice",
                                              value="916-653-6814")

        secretary_of_state.add_contact_detail(
            type="address", value="1500 11th Street, Sacramento, CA 95814")

        secretary_of_state.add_link(url="http://www.sos.ca.gov",
                                    note="Home page")

        self._secretary_of_state = secretary_of_state
        yield secretary_of_state
Example No. 10
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        page = self.get(link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(link)

        is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))
        if is_subcommittee:
            # All TN subcommittees are just the name of the parent committee with " Subcommittee"
            # at the end
            parent_committee_name = re.sub(r"\s*(Study )?Subcommittee\s*", "",
                                           committee_name)
            com = Organization(
                committee_name,
                classification="committee",
                parent_id=self.parents[parent_committee_name],
            )
        else:
            com = Organization(committee_name,
                               chamber=chamber,
                               classification="committee")
            self.parents[committee_name] = com._id

        OFFICER_SEARCH = ('//h2[contains(text(), "Committee Officers")]/'
                          "following-sibling::div/ul/li/a")
        MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                         "following-sibling::div/ul/li/a")
        for a in page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH):

            member_name = " ".join([
                x.strip() for x in a.xpath("text()") + a.xpath("span/text()")
                if x.strip()
            ])
            role = a.xpath("small")
            if role:
                role = role[0].xpath("text()")[0].strip()
            else:
                role = "member"
            if "(Vacant)" in role:
                continue

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
Example No. 11
    def scrape(self, chamber=None):
        committees_url = 'http://le.utah.gov/data/committees.json'
        committees = self.get(committees_url).json()['committees']

        people_url = 'http://le.utah.gov/data/legislators.json'
        people = self.get(people_url).json()['legislators']

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person['id']] = person['formatName']

        for committee in committees:
            name = committee['description']
            if name.endswith(' Committee'):
                name = name[:len(name) - len(' Committee')]
            elif name.endswith(' Subcommittee'):
                name = name[:len(name) - len(' Subcommittee')]
            if name.startswith('House '):
                name = name[len('House '):]
                chamber = 'lower'
            elif name.startswith('Senate '):
                name = name[len('Senate '):]
                chamber = 'upper'
            else:
                chamber = 'legislature'

            c = Organization(
                chamber=chamber,
                name=name,
                classification='committee'
            )
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee['link'])

            for member in committee['members']:
                try:
                    member_name = ids_to_names[member['id']]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " +
                        member['id']
                    )
                    continue
                c.add_member(member_name, role=member['position'])

            yield c
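The two Utah JSON feeds are joined purely on the legislator id; based on the keys read above, the minimal structures this scraper expects are:

# Minimal shapes of the two JSON feeds, as implied by the keys read above;
# values are illustrative placeholders.
people = [{'id': 'EX001', 'formatName': 'Jane Example'}]
committees = [{
    'description': 'House Example Committee',
    'link': 'http://le.utah.gov/committee/example',
    'members': [{'id': 'EX001', 'position': 'Chair'}],
}]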
Example No. 12
    def scrape_committees(self, repos):
        for repo in repos:
            source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(
                repo)
            committees = self.fetch_yaml(source)
            for committee in committees:
                org = Organization(committee['name'],
                                   classification='committee')

                org.add_source(source)

                for key in committee.keys() & {'url', 'rss_url'}:
                    org.add_link(committee[key])

                for key in committee.keys() & {'phone', 'address'}:
                    contact_type = 'voice' if key == 'phone' else key
                    org.add_contact_detail(
                        type=contact_type, value=committee[key])

                for key in committee.keys() & {
                        'senate_committee_id', 'house_committee_id',
                        'thomas_id'
                }:
                    org.add_identifier(committee[key], scheme=key)

                if 'subcommittees' in committee:
                    for subcommittee in committee['subcommittees']:
                        sub_org = Organization(subcommittee['name'],
                                               classification="committee",
                                               parent_id=org._id)

                        sub_org.add_identifier(subcommittee['thomas_id'],
                                               scheme="thomas")
                        sub_org.add_source(source)

                        for key in subcommittee.keys() & {'phone', 'address'}:
                            contact_type = 'voice' if key == 'phone' else key
                            sub_org.add_contact_detail(
                                type=contact_type, value=subcommittee[key])

                        yield sub_org

                yield org
Example No. 13
    def scrape(self, chamber=None):
        committees_url = 'http://le.utah.gov/data/committees.json'
        committees = self.get(committees_url).json()['committees']

        people_url = 'http://le.utah.gov/data/legislators.json'
        people = self.get(people_url).json()['legislators']

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person['id']] = person['formatName']

        for committee in committees:
            name = committee['description']
            if name.endswith(' Committee'):
                name = name[:len(name) - len(' Committee')]
            elif name.endswith(' Subcommittee'):
                name = name[:len(name) - len(' Subcommittee')]
            if name.startswith('House '):
                name = name[len('House '):]
                chamber = 'lower'
            elif name.startswith('Senate '):
                name = name[len('Senate '):]
                chamber = 'upper'
            else:
                chamber = 'legislature'

            c = Organization(chamber=chamber,
                             name=name,
                             classification='committee')
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee['link'])

            for member in committee['members']:
                try:
                    member_name = ids_to_names[member['id']]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " +
                        member['id'])
                    continue
                c.add_member(member_name, role=member['position'])

            yield c
Example No. 14
    def scrape(self, chamber=None):
        committees_url = "http://le.utah.gov/data/committees.json"
        committees = self.get(committees_url).json()["committees"]

        people_url = "http://le.utah.gov/data/legislators.json"
        people = self.get(people_url).json()["legislators"]

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person["id"]] = person["formatName"]

        for committee in committees:
            name = committee["description"]
            if name.endswith(" Committee"):
                name = name[: len(name) - len(" Committee")]
            elif name.endswith(" Subcommittee"):
                name = name[: len(name) - len(" Subcommittee")]
            if name.startswith("House "):
                name = name[len("House ") :]
                chamber = "lower"
            elif name.startswith("Senate "):
                name = name[len("Senate ") :]
                chamber = "upper"
            else:
                chamber = "legislature"

            c = Organization(chamber=chamber, name=name, classification="committee")
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee["link"])

            for member in committee["members"]:
                try:
                    member_name = ids_to_names[member["id"]]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " + member["id"]
                    )
                    continue
                c.add_member(member_name, role=member["position"])

            yield c
Example No. 15
    def get_organizations(self):

        secretary_of_the_commonwealth = Organization(
            name="Office of the Secretary of the Commonwealth, Commonwealth of Virginia",
            classification="office"
        )
        secretary_of_the_commonwealth.add_contact_detail(
            type="voice",
            value="804-786-2441"
        )
        secretary_of_the_commonwealth.add_contact_detail(
            type="address",
            value="1111 East Broad Street, 4th Floor, Richmond, Virginia 23219"
        )
        secretary_of_the_commonwealth.add_link(
            url="https://commonwealth.virginia.gov/",
            note="Home page"
        )

        self._secretary_of_the_commonwealth = secretary_of_the_commonwealth

        yield secretary_of_the_commonwealth
Example No. 16
    def get_organizations(self):
        secretary_of_state = Organization(
            name="Office of the Secretary of State, State of California",
            classification="office"
        )

        secretary_of_state.add_contact_detail(
            type="voice",
            value="916-653-6814"
        )

        secretary_of_state.add_contact_detail(
            type="address",
            value="1500 11th Street, Sacramento, CA 95814"
        )

        secretary_of_state.add_link(
            url="http://www.sos.ca.gov",
            note="Home page"
        )

        self._secretary_of_state = secretary_of_state
        yield secretary_of_state
Example No. 17
    def scrape_joint_committee(self, committee_name, url):
        if "state.tn.us" in url:
            com = Organization(committee_name,
                               chamber="legislature",
                               classification="committee")
            try:
                page = self.get(url).text
            except requests.exceptions.ConnectionError:
                self.logger.warning("Committee link is broken, skipping")
                return

            page = lxml.html.fromstring(page)

            for el in page.xpath(
                    "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            ):
                if el.xpath("text()") == ["Vacant"]:
                    continue

                (member_name, ) = el.xpath("a/text()")
                if el.xpath("text()"):
                    role = el.xpath("text()")[0].strip(" ,")
                else:
                    role = "member"

                member_name = member_name.replace("Senator", "")
                member_name = member_name.replace("Representative", "")
                member_name = member_name.strip()
                com.add_member(member_name, role)

            com.add_link(url)
            com.add_source(url)
            return com

        elif "gov-opps" in url:
            com = Organization(committee_name,
                               chamber="legislature",
                               classification="committee")
            page = self.get(url).text
            page = lxml.html.fromstring(page)

            links = ["senate", "house"]
            for link in links:
                chamber_link = self.base_href + "/" + link + "/committees/gov-opps.html"
                chamber_page = self.get(chamber_link).text
                chamber_page = lxml.html.fromstring(chamber_page)

                OFFICER_SEARCH = (
                    '//h2[contains(text(), "Committee Officers")]/'
                    "following-sibling::div/ul/li/a")
                MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                                 "following-sibling::div/ul/li/a")
                for a in chamber_page.xpath(
                        OFFICER_SEARCH) + chamber_page.xpath(MEMBER_SEARCH):
                    member_name = " ".join(
                        [x.strip() for x in a.xpath(".//text()") if x.strip()])
                    role = a.xpath("small")
                    if role:
                        role = role[0].xpath("text()")[0].strip()
                        member_name = member_name.replace(role, "").strip()
                    else:
                        role = "member"
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_link(url)
            com.add_source(url)
            return com

        else:
            return self._scrape_committee(committee_name, url, "legislature")
Example No. 18
    def scrape_joint_committee(self, committee_name, url):
        if 'state.tn.us' in url:
            com = Organization(
                committee_name,
                chamber='legislature',
                classification='committee',
            )
            try:
                page = self.get(url).text
            except requests.exceptions.ConnectionError:
                self.logger.warning("Committee link is broken, skipping")
                return

            page = lxml.html.fromstring(page)

            for el in page.xpath(
                    "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            ):
                if el.xpath('text()') == ['Vacant']:
                    continue

                (member_name, ) = el.xpath('a/text()')
                if el.xpath('text()'):
                    role = el.xpath('text()')[0].strip(' ,')
                else:
                    role = 'member'

                member_name = member_name.replace('Senator', '')
                member_name = member_name.replace('Representative', '')
                member_name = member_name.strip()
                com.add_member(member_name, role)

            com.add_link(url)
            com.add_source(url)
            return com

        elif 'gov-opps' in url:
            com = Organization(
                committee_name,
                chamber='legislature',
                classification='committee',
            )
            page = self.get(url).text
            page = lxml.html.fromstring(page)

            links = ['senate', 'house']
            for link in links:
                chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
                chamber_page = self.get(chamber_link).text
                chamber_page = lxml.html.fromstring(chamber_page)

                OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                                 'following-sibling::div/ul/li/a'
                MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                                'following-sibling::div/ul/li/a'
                for a in (chamber_page.xpath(OFFICER_SEARCH) +
                          chamber_page.xpath(MEMBER_SEARCH)):
                    member_name = ' '.join(
                        [x.strip() for x in a.xpath('.//text()') if x.strip()])
                    role = a.xpath('small')
                    if role:
                        role = role[0].xpath('text()')[0].strip()
                        member_name = member_name.replace(role, '').strip()
                    else:
                        role = 'member'
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_link(url)
            com.add_source(url)
            return com

        else:
            return self._scrape_committee(committee_name, url, 'legislature')
Example No. 19
    def get_organizations(self):
        legislature = Organization("United States Congress",
                                   classification='legislature')

        self._legislature = legislature

        yield legislature

        senate = Organization(
            name="United States Senate",
            classification='upper',
            parent_id=legislature._id,
        )

        self._senate = senate

        yield senate

        house = Organization(
            name="United States House",
            classification='lower',
            parent_id=legislature._id,
        )

        self._house = house

        yield house

        sopr = Organization(
            name="Office of Public Record, US Senate",
            classification="office",
            parent_id=senate._id,
        )

        sopr.add_contact_detail(type="voice", value="202-224-0322")

        sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "one_item_and_teasers/opr.htm",
                        note="Profile page")

        sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "g_three_sections_with_teasers/lobbyingdisc.htm"
                        "#lobbyingdisc=lda",
                        note="Disclosure Home")

        sopr.add_link(url="http://soprweb.senate.gov/index.cfm"
                      "?event=selectfields",
                      note="Disclosure Search Portal")

        sopr.add_link(url="http://soprweb.senate.gov/",
                      note="Disclosure Electronic Filing System")

        self._sopr = sopr

        yield sopr

        house_clerk = Organization(
            name="Office of the Clerk, US House",
            classification="office",
            parent_id=house._id,
        )

        house_clerk.add_contact_detail(type="voice", value="202-225-7000")

        house_clerk.add_source(url="http://clerk.house.gov/", note="Home page")

        self._house_clerk = house_clerk

        yield house_clerk
        yield legislature
Example No. 20
    def get_organizations(self):
        legislature = Organization("United States Congress",
                                   classification='legislature')

        self._legislature = legislature

        yield legislature

        senate = Organization(
            name="United States Senate",
            classification='upper',
            parent_id=legislature._id,
        )

        self._senate = senate

        yield senate

        house = Organization(
            name="United States House",
            classification='lower',
            parent_id=legislature._id,
        )

        self._house = house

        yield house

        sopr = Organization(
            name="Office of Public Record, US Senate",
            classification="office",
            parent_id=senate._id,
        )

        sopr.add_contact_detail(type="voice",
                                value="202-224-0322")

        sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                            "one_item_and_teasers/opr.htm",
                        note="Profile page")

        sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                            "g_three_sections_with_teasers/lobbyingdisc.htm"
                            "#lobbyingdisc=lda",
                        note="Disclosure Home")

        sopr.add_link(url="http://soprweb.senate.gov/index.cfm"
                          "?event=selectfields",
                      note="Disclosure Search Portal")

        sopr.add_link(url="http://soprweb.senate.gov/",
                      note="Disclosure Electronic Filing System")

        self._sopr = sopr

        yield sopr

        house_clerk = Organization(
            name="Office of the Clerk, US House",
            classification="office",
            parent_id=house._id,
        )

        house_clerk.add_contact_detail(type="voice",
                                       value="202-225-7000")

        house_clerk.add_source(url="http://clerk.house.gov/",
                               note="Home page")

        self._house_clerk = house_clerk

        yield house_clerk
        yield legislature
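Examples No. 19 and No. 20 build the chamber hierarchy by passing the already-created parent's _id as the child's parent_id. The same pattern nests to any depth; a minimal sketch using only the constructor arguments shown above, with hypothetical names:

# Parent/child chaining via parent_id=parent._id, as in the examples above;
# organization names are hypothetical.
legislature = Organization("Example Legislature", classification="legislature")
senate = Organization(name="Example Senate", classification="upper",
                      parent_id=legislature._id)
committee = Organization(name="Example Committee", classification="committee",
                         parent_id=senate._id)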
Example No. 21
    def scrape_joint_committee(self, committee_name, url):
        if 'state.tn.us' in url:
            com = Organization(
                committee_name,
                chamber='joint',
                classification='committee',
            )
            try:
                page = self.get(url).text
            except requests.exceptions.ConnectionError:
                self.logger.warning("Committee link is broken, skipping")
                return

            page = lxml.html.fromstring(page)

            for el in page.xpath(
                "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            ):
                if el.xpath('text()') == ['Vacant']:
                    continue

                (member_name, ) = el.xpath('a/text()')
                if el.xpath('text()'):
                    role = el.xpath('text()')[0].strip(' ,')
                else:
                    role = 'member'

                member_name = member_name.replace('Senator', '')
                member_name = member_name.replace('Representative', '')
                member_name = member_name.strip()
                com.add_member(member_name, role)

            com.add_link(url)
            com.add_source(url)
            return com

        elif 'gov-opps' in url:
            com = Organization(
                committee_name,
                chamber='joint',
                classification='committee',
            )
            page = self.get(url).text
            page = lxml.html.fromstring(page)

            links = ['senate', 'house']
            for link in links:
                chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
                chamber_page = self.get(chamber_link).text
                chamber_page = lxml.html.fromstring(chamber_page)

                OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                                 'following-sibling::div/ul/li/a'
                MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                                'following-sibling::div/ul/li/a'
                for a in (
                        chamber_page.xpath(OFFICER_SEARCH) +
                        chamber_page.xpath(MEMBER_SEARCH)
                        ):
                    member_name = ' '.join([
                            x.strip() for x in
                            a.xpath('.//text()')
                            if x.strip()
                            ])
                    role = a.xpath('small')
                    if role:
                        role = role[0].xpath('text()')[0].strip()
                        member_name = member_name.replace(role, '').strip()
                    else:
                        role = 'member'
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_link(url)
            com.add_source(url)
            return com

        else:
            return self._scrape_committee(committee_name, url, 'joint')