Example #1
0
    def scrape(self):
        """Yield one committee ``Organization`` per committee code.

        Committee records are collected for the jurisdiction's legislative
        sessions (passed in reversed order) and grouped by their ``code``.
        The first record of each group supplies the organization's name,
        description and identifier. Every record contributes a source URL
        and a ``{term: decision_body_id}`` entry, and any record whose name
        differs from the first one is also passed to ``add_name``.

        The group with code ``'CC'`` is skipped.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue

            # An empty group has no record to build from. Without this guard
            # the final ``yield`` would emit the previous code's organization
            # again (or fail on an unbound name for the first code).
            if not instances:
                continue

            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical = instances[0]
            o = Organization(name=canonical['name'], classification='committee')
            o.add_identifier(canonical['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            extras = {
                'tmmis_decision_body_ids': [],
                'description': canonical['info'],
            }
            for inst in instances:
                extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
                o.add_source(inst['source_url'])
                if inst['name'] != canonical['name']:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])
            o.extras = extras

            yield o
Example #2
0
    def scrape(self):
        """Build and yield a committee ``Organization`` for each code.

        Records returned by ``committees_from_sessions`` are grouped by
        ``code``. Within a group the first record is the canonical one: it
        provides the name, the description stored in ``extras`` and the
        identifier. Each record in the group adds its source URL and a
        ``{term: decision_body_id}`` mapping; records named differently
        from the canonical one are additionally passed to ``add_name``.

        The ``'CC'`` group is skipped.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(
            self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue

            # Nothing to build from an empty group; skipping avoids yielding
            # a stale organization left over from the previous code.
            if not instances:
                continue

            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical = instances[0]
            canonical_name = canonical['name']

            o = Organization(name=canonical_name,
                             classification='committee')
            o.add_identifier(canonical['code'],
                             scheme=TWO_LETTER_ORG_CODE_SCHEME)

            decision_body_ids = []
            extras = {'tmmis_decision_body_ids': decision_body_ids,
                      'description': canonical['info']}

            for inst in instances:
                decision_body_ids.append(
                    {inst['term']: inst['decision_body_id']})
                o.add_source(inst['source_url'])
                if inst['name'] != canonical_name:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])

            o.extras = extras
            yield o
Example #3
0
def test_extras_organization():
    """Nested ``extras`` set on a scraped organization survive the import."""
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.extras = {"hello": "world", "foo": {"bar": "baz"}}

    importer = OrganizationImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])

    # Exactly one organization is expected to exist after the import.
    stored = Organization.objects.get()
    assert stored.extras['foo']['bar'] == 'baz'
def test_extras_organization():
    """Import an organization carrying extras and read a nested value back."""
    nested_extras = {
        "hello": "world",
        "foo": {"bar": "baz"},
    }
    source_org = ScrapeOrganization('United Nations',
                                    classification='international')
    source_org.extras = nested_extras
    payload = [source_org.as_dict()]
    OrganizationImporter('jurisdiction-id').import_data(payload)

    # ``get()`` with no filters relies on a single organization existing.
    imported = Organization.objects.get()
    assert imported.extras['foo']['bar'] == 'baz'
Example #5
0
    def scrape(self):
        """Yield committee ``Organization`` objects with councillor members.

        Committee records from the jurisdiction's legislative sessions are
        grouped by ``code``. The first record of each group is canonical: it
        supplies name, description, identifier and, when a reference meeting
        id can be found for its code and term, the councillor memberships
        and one post per distinct role. Every record adds a source URL and a
        ``{term: decision_body_id}`` entry; records whose name differs from
        the canonical name are also passed to ``add_name``.

        Groups with code ``'CC'`` or with a falsy code are skipped.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(
            self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue

            # When there are no meetings scheduled and was no way to deduce
            # committee code.
            if not code:
                continue

            # An empty group would otherwise reach the final ``yield`` with
            # the previous code's organization (or an unbound name).
            if not instances:
                continue

            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical = instances[0]
            o = Organization(name=canonical['name'],
                             classification='committee')
            o.add_identifier(canonical['code'],
                             scheme=TWO_LETTER_ORG_CODE_SCHEME)
            extras = {'tmmis_decision_body_ids': [],
                      'description': canonical['info']}

            # TODO: Scrape non-councillor members
            meeting_id = self.referenceMeetingId(
                canonical['code'], canonical['term'])
            if meeting_id:
                # Each role gets exactly one post, created the first time
                # the role is seen.
                seen_posts = set()
                membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                for councillor in self.councillorMembers(membership_url):
                    name, role = councillor['name'], councillor['role']
                    o.add_member(name, role)
                    if role not in seen_posts:
                        # TODO: More specific divisions for some committee?
                        o.add_post(
                            role=role,
                            label=role,
                            division_id=self.jurisdiction.division_id)
                        seen_posts.add(role)

            for inst in instances:
                extras['tmmis_decision_body_ids'].append(
                    {inst['term']: inst['decision_body_id']})
                o.add_source(inst['source_url'])
                if inst['name'] != canonical['name']:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])
            o.extras = extras

            yield o
Example #6
0
    def scrape(self):
        """Yield one committee ``Organization`` per code, with memberships.

        Records produced by ``committees_from_sessions`` are grouped by
        ``code``. For each group, the first record defines the organization
        (name, description, identifier). If ``referenceMeetingId`` returns a
        meeting id for that record, councillors read from the membership URL
        are added as members, and a post is added once per distinct role.
        All records of the group contribute a source URL and a
        ``{term: decision_body_id}`` entry; a record with a different name
        than the first is also passed to ``add_name``.

        Groups with code ``'CC'`` or a falsy code are skipped.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue

            # When there are no meetings scheduled and was no way to deduce committee code.
            if not code:
                continue

            # Skip empty groups: there is no canonical record, and falling
            # through would re-yield the organization built for the last code.
            if not instances:
                continue

            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical = instances[0]
            canonical_name = canonical['name']

            o = Organization(name=canonical_name, classification='committee')
            o.add_identifier(canonical['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            # TODO: Scrape non-councillor members
            meeting_id = self.referenceMeetingId(canonical['code'], canonical['term'])
            if meeting_id:
                roles_with_post = set()
                membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                for councillor in self.councillorMembers(membership_url):
                    name, role = councillor['name'], councillor['role']
                    o.add_member(name, role)
                    if role in roles_with_post:
                        continue
                    o.add_post(
                        role=role,
                        label=role,
                        # TODO: More specific divisions for some committee?
                        division_id=self.jurisdiction.division_id,
                    )
                    roles_with_post.add(role)

            decision_body_ids = []
            for inst in instances:
                decision_body_ids.append({inst['term']: inst['decision_body_id']})
                o.add_source(inst['source_url'])
                if inst['name'] != canonical_name:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])

            o.extras = {
                'tmmis_decision_body_ids': decision_body_ids,
                'description': canonical['info'],
            }

            yield o
Example #7
0
    def scrape_committee(self, comm_num):
        """Scrape one committee page and yield it as an ``Organization``.

        The committee name is the first ``<h1>`` text on the page. Member
        names come from the links listed after the "Committee Members"
        heading; the first name is treated as the chair and the rest as
        members. Anyone named 'Lewis Reed' is not added. The description is
        read from the "About" section and stored in ``extras``.
        """
        url = self.committee_url(comm_num)
        page = self.lxmlize(url)

        title = page.xpath("//h1/text()")[0]
        comm = Organization(
            name=title,
            classification="committee",
            chamber="legislature",
        )
        comm.add_source(url=url)

        # FIXME do we need a separate post for each member?
        # FIXME is member an appropriate name?
        for post_name in ("chair", "member"):
            comm.add_post(label=post_name, role=post_name)

        # The member list hangs off the "Committee Members" heading.
        members_heading = page.xpath(
            "//h2[text()='Committee Members']")[0]
        raw_names = members_heading.xpath(
            "following-sibling::div/ul/li/a/text()")
        fl_names = [HumanName.name_firstandlast(raw) for raw in raw_names]
        print("My attempt to scrub people's names:",
              list(zip(raw_names, fl_names)))

        # First entry is the chair; unpacking fails if the list is empty.
        chair_name, *other_names = fl_names
        roster = [(chair_name, "chair")]
        roster.extend((member, "member") for member in other_names)

        excluded = {'Lewis Reed'}
        for person, role in roster:
            if person in excluded:
                continue
            comm.add_member(person, role=role)

        # Exactly one text node is expected for the description paragraph.
        about_heading = page.xpath("//h2[text()='About']")[0]
        (about_text, ) = about_heading.xpath(
            "parent::div//div[@class='content-block']/p[2]/text()")
        comm.extras = {"description": about_text.strip()}

        yield comm
Example #8
0
	def scrape_committee(self, comm_num):
                url = self.committee_url(comm_num)
                page = self.lxmlize(url)
                # get title
                comm_name = page.xpath("//h1/text()")[0]

                # create object
                comm = Organization(name=comm_name,
                                    classification="committee",
                                    chamber="legislature")
                comm.add_source(url=url)

		# add posts
                comm.add_post(label="chair", role="chair")
		# FIXME do we need a separate post for each member?
		# FIXME is member an appropriate name?
                comm.add_post(label="member", role="member") 

		# helper for finding other nodes
                landmark_node = page.xpath("//h2[text()='Committee Members']")[0]

		# add memberships
                member_names = landmark_node.xpath("following-sibling::div/ul/li/a/text()")
                fl_names = [HumanName.name_firstandlast(name) for name in member_names]
                print("My attempt to scrub people's names:", 
                      list(zip(member_names, fl_names)))
                chair_name, *other_names = fl_names
                if chair_name not in {'Lewis Reed'} :
                        comm.add_member(chair_name, role="chair")
                for name in other_names:
                        if name not in {'Lewis Reed'} :
                                comm.add_member(name, role="member")
		# add description 
                about_node = page.xpath("//h2[text()='About']")[0]
                (description, ) = about_node.xpath("parent::div//div[@class='content-block']/p[2]/text()")
                description = description.strip()
                comm.extras = {"description": description}

                yield comm
Example #9
0
    def scrape_session(self, session, chambers):
        """Yield committee and subcommittee Organizations for one session.

        The committee listing for the session's site id is requested through
        ``self.cservice`` (each call wrapped in ``backoff``), then every
        listed committee is fetched individually. Members are added to the
        committee and to each subcommittee named on the member record. For
        every committee, its subcommittees are yielded first and the
        committee itself afterwards.

        Committees are cached in ``self.ctty_cache`` under "<Type>-<Code>";
        a committee already in the cache is reused rather than recreated.

        ``chambers`` is accepted but not used here.

        Raises:
            ValueError: if the service returns an empty committee listing.
            KeyError: if a committee's ``Type`` is not "House", "Senate" or
                "Joint".
        """
        sid = SESSION_SITE_IDS[session]
        committees = backoff(self.cservice.GetCommitteesBySession, sid)

        # An empty response means no data came back for this session; fail
        # loudly rather than returning silently so the problem is visible.
        if str(committees).strip() == "":
            raise ValueError("Error: No committee data for sid: %s" % (sid))

        chamber_by_type = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }

        for listing in committees['CommitteeListing']:
            committee = backoff(self.cservice.GetCommittee, listing['Id'])
            subctty_cache = {}

            comname, typ, guid, code, description = [committee[x] for x in [
                'Name', 'Type', 'Id', 'Code', 'Description'
            ]]
            comchamber = chamber_by_type[typ]
            cttie_url = CTTIE_URL.format(sid=sid, cttie=guid)

            ctty_key = '{}-{}'.format(typ, code)
            if ctty_key in self.ctty_cache:
                # Reuse the cached committee. Previously ``ctty`` was left
                # pointing at the prior iteration's committee in this case,
                # so members and sources were attached to the wrong one.
                ctty = self.ctty_cache[ctty_key]
            else:
                ctty = Organization(chamber=comchamber, name=comname, classification='committee')
                ctty.extras = {
                    'code': code,
                    'guid': guid,
                    'description': description,
                }
                self.ctty_cache[ctty_key] = ctty

            for member in committee['Members']['CommitteeMember']:
                name = "{First} {Last}".format(**dict(member['Member']['Name']))
                role = member['Role']
                member_guid = member['Member']['Id']

                membership = ctty.add_member(name, role)
                membership.extras = {'guid': member_guid}

                for raw_subcom in member['SubCommittees'] or []:
                    subcom = raw_subcom[1][0]
                    subcommittee = subcom['Name']
                    if subcommittee in subctty_cache:
                        # Add member to existing subcommittee.
                        subctty = subctty_cache[subcommittee]
                    else:
                        # Create subcommittee.
                        subctty = Organization(
                            name=subcommittee, classification='committee',
                            parent_id={'classification': comchamber, 'name': comname})
                        subctty.extras = {
                            'guid': subcom['Id'],
                        }
                        subctty.add_source(self.csource)
                        # The subcommittee's source uses the parent's guid.
                        subctty.add_source(cttie_url)
                        subctty_cache[subcommittee] = subctty
                    membership = subctty.add_member(name, role)
                    membership.extras = {'guid': member_guid}

            for subctty in subctty_cache.values():
                yield subctty

            # NOTE(review): a committee reused from the cache receives these
            # sources again and is yielded again - confirm that is intended.
            ctty.add_source(self.csource)
            ctty.add_source(cttie_url)
            yield ctty
Example #10
0
    def _scrape_lower_chamber(self, session):
        """Scrape committees listed on the lower chamber's active-committee page.

        Each row of the first table inside ``div.lightened`` (except the last
        row, which holds a date) is one committee. The row text is split on
        commas: the first part is the name and the second part, when present,
        is stored as ``extras['status']`` (``None`` otherwise). A committee
        whose name contains "joint" is assigned to the ``'joint'`` chamber
        instead of ``'lower'``. The committee's own page is then fetched and
        its ``memGroup`` rows are turned into memberships, each carrying the
        href of the row's first link (or ``None``) as ``extras['code']``.

        ``session`` is accepted but not used here.

        Yields:
            One ``Organization`` per committee row.
        """
        self.info('Scraping lower chamber for committees.')

        chamber = 'lower'
        # Decorations stripped from listed names, applied in this order.
        name_noise = ('Committee On ', 'Special', 'Select', 'Joint',
                      ' Committee')

        url = '{base}ActiveCommittees.aspx'.format(base=self._reps_url_base)
        page_string = self.get(url).text
        page = lxml.html.fromstring(page_string)
        table = page.xpath('//div[@class="lightened"]/table[1]')[0]
        # Last tr has the date
        trs = table.xpath('tr')[:-1]
        for tr in trs:
            committee_parts = [part.strip()
                               for part in tr.text_content().split(',')]
            committee_name = committee_parts[0].title().strip()
            # ``split`` always returns at least one part, so the status only
            # exists when there is a second one. Resetting it per row also
            # keeps a previous row's status from leaking into this one.
            status = committee_parts[1] if len(committee_parts) > 1 else None
            committee_url = tr.xpath('td/a')[0].attrib.get('href')
            committee_url = '{base}{url}'.format(base=self._reps_url_base,
                                                 url=committee_url)
            actual_chamber = chamber
            if 'joint' in committee_name.lower():
                actual_chamber = 'joint'

            for noise in name_noise:
                committee_name = committee_name.replace(noise, '')
            committee_name = committee_name.strip()

            committee = Organization(
                committee_name,
                chamber=actual_chamber,
                classification='committee',
            )
            committee.extras = {'status': status}
            committee_page_string = self.get(committee_url).text
            committee_page = lxml.html.fromstring(committee_page_string)
            # First tr has the title (sigh)
            mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
            for mem_tr in mem_trs:
                mem_code = None
                mem_links = mem_tr.xpath('td/a[1]')
                if len(mem_links):
                    mem_code = mem_links[0].attrib.get('href')
                # Output is "Rubble, Barney, Neighbor"
                mem_parts = mem_tr.text_content().strip().split(',')
                if self._no_members_text in mem_parts:
                    continue
                mem_name = (mem_parts[1].strip() + ' ' +
                            mem_parts[0].strip())
                # Sometimes Senator abbreviation is in the name
                mem_name = mem_name.replace('Sen. ', '')
                mem_role = 'member'
                if len(mem_parts) > 2:
                    # Handle the case where there is a comma in the
                    # role name
                    mem_role = ', '.join(
                        [p.strip() for p in mem_parts[2:]]).lower()
                membership = committee.add_member(mem_name, role=mem_role)
                membership.extras = {'code': mem_code}

            committee.add_source(url)
            committee.add_source(committee_url)

            yield committee
Example #11
0
    def scrape_session(self, session, chambers):
        """Scrape committees and subcommittees for ``session``.

        Looks up the site id for the session, requests the committee listing
        and then each committee via ``self.cservice`` (through ``backoff``).
        Committee members are added to the committee and to every
        subcommittee listed on their member record. Subcommittees of a
        committee are yielded before the committee itself.

        Committees are stored in ``self.ctty_cache`` keyed by
        "<Type>-<Code>"; an entry that already exists is reused.

        ``chambers`` is accepted but not used here.

        Raises:
            ValueError: if the committee listing comes back empty.
            KeyError: if a committee's ``Type`` is not "House", "Senate" or
                "Joint".
        """
        sid = SESSION_SITE_IDS[session]
        committees = backoff(self.cservice.GetCommitteesBySession, sid)

        # No listing at all is treated as an error so that a failed fetch is
        # not mistaken for a session without committees.
        if str(committees).strip() == "":
            raise ValueError("Error: No committee data for sid: %s" % (sid))

        type_to_chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }

        committees = committees['CommitteeListing']
        for entry in committees:
            committee = backoff(self.cservice.GetCommittee, entry['Id'])
            subctty_cache = {}

            comname, typ, guid, code, description = [
                committee[x]
                for x in ['Name', 'Type', 'Id', 'Code', 'Description']
            ]
            comchamber = type_to_chamber[typ]
            source_url = CTTIE_URL.format(sid=sid, cttie=guid)

            ctty_key = '{}-{}'.format(typ, code)
            if ctty_key not in self.ctty_cache:
                ctty = Organization(chamber=comchamber,
                                    name=comname,
                                    classification='committee')
                ctty.extras = {
                    'code': code,
                    'guid': guid,
                    'description': description,
                }
                self.ctty_cache[ctty_key] = ctty
            else:
                # Pick up the cached committee; without this, ``ctty`` kept
                # referring to the committee handled in the previous
                # iteration and collected the wrong members and sources.
                ctty = self.ctty_cache[ctty_key]

            members = committee['Members']['CommitteeMember']
            for member in members:
                name = "{First} {Last}".format(
                    **dict(member['Member']['Name']))
                role = member['Role']
                member_extras_guid = member['Member']['Id']

                membership = ctty.add_member(name, role)
                membership.extras = {'guid': member_extras_guid}

                subcoms = member['SubCommittees'] or []
                for item in subcoms:
                    subcom = item[1][0]
                    subguid = subcom['Id']
                    subcommittee = subcom['Name']
                    if subcommittee in subctty_cache:
                        # Add member to existing subcommittee.
                        subctty = subctty_cache[subcommittee]
                    else:
                        # Create subcommittee.
                        subctty = Organization(
                            name=subcommittee,
                            classification='committee',
                            parent_id={
                                'classification': comchamber,
                                'name': comname,
                            })
                        subctty.extras = {
                            'guid': subguid,
                        }
                        subctty.add_source(self.csource)
                        # Built from the parent committee's guid.
                        subctty.add_source(source_url)
                        subctty_cache[subcommittee] = subctty
                    membership = subctty.add_member(name, role)
                    membership.extras = {'guid': member_extras_guid}

            for subctty in subctty_cache.values():
                yield subctty

            # NOTE(review): a cached committee gets these sources appended
            # again and is yielded again - confirm whether that is wanted.
            ctty.add_source(self.csource)
            ctty.add_source(source_url)
            yield ctty
Example #12
0
    def transform_parse(self, parsed_form, response):

        _source = {
            "url": response.url,
            "note": "LDA Form LD-1"
        }

        # basic disclosure fields
        _disclosure = Disclosure(
            effective_date=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            timezone='America/New_York',
            submitted_date=datetime.strptime(
                parsed_form['datetimes']['signature_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification="lobbying"
        )

        _disclosure.add_authority(name=self.authority.name,
                                  type=self.authority._type,
                                  id=self.authority._id)

        _disclosure.add_identifier(
            identifier=parsed_form['_meta']['document_id'],
            scheme="urn:sopr:filing"
        )

        # disclosure extras
        _disclosure.extras = {}
        _disclosure.extras['registrant'] = {
            'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
            'general_description': parsed_form['registrant']['registrant_general_description'],
            'signature': {
                "signature_date": parsed_form['datetimes']['signature_date'],
                "signature": parsed_form['signature']
            }
        }

        _disclosure.extras['client'] = {
            'same_as_registrant':
                parsed_form['client']['client_self'],
            'general_description':
                parsed_form['client']['client_general_description']
        }

        _disclosure.extras['registration_type'] = {
            'is_amendment':
                parsed_form['registration_type']['is_amendment'],
            'new_registrant':
                parsed_form['registration_type']['new_registrant'],
            'new_client_for_existing_registrant':
                parsed_form['registration_type'][
                    'new_client_for_existing_registrant'],
        }

        # # Registrant
        # build registrant
        _registrant_self_employment = None

        if parsed_form['registrant']['self_employed_individual']:
            n = ' '.join([p for p in [
                parsed_form['registrant']['registrant_individual_prefix'],
                parsed_form['registrant']['registrant_individual_firstname'],
                parsed_form['registrant']['registrant_individual_lastname']
            ] if len(p) > 0]).strip()

            _registrant = Person(
                name=n,
                source_identified=True
            )

            _registrant_self_employment = Organization(
                name='SELF-EMPLOYMENT of {n}'.format(n=n),
                classification='company',
                source_identified=True
            )

            _registrant.add_membership(
                organization=_registrant_self_employment,
                role='self_employed',
                label='self-employment of {n}'.format(n=n),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant = Organization(
                name=parsed_form['registrant']['registrant_org_name'],
                classification='company',
                source_identified=True
            )

        if len(parsed_form['registrant']['registrant_house_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_house_id'],
                scheme='urn:house_clerk:registrant'
            )

        if len(parsed_form['registrant']['registrant_senate_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_senate_id'],
                scheme='urn:sopr:registrant'
            )

        registrant_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['registrant']['registrant_address_one'],
                        parsed_form['registrant']['registrant_address_two'],
                        parsed_form['registrant']['registrant_city'],
                        parsed_form['registrant']['registrant_state'],
                        parsed_form['registrant']['registrant_zip'],
                        parsed_form['registrant']['registrant_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            },
        ]

        registrant_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_ppb_city'],
                    parsed_form['registrant']['registrant_ppb_state'],
                    parsed_form['registrant']['registrant_ppb_zip'],
                    parsed_form['registrant']['registrant_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if registrant_contact_ppb["value"]:
            registrant_contact_details.append(registrant_contact_ppb)

        for cd in registrant_contact_details:
            _registrant.add_contact_detail(**cd)

        _registrant.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address_one",
                            "value": parsed_form['registrant'][
                                'registrant_address_one'],
                        },
                        {
                            "note": "address_two",
                            "value": parsed_form['registrant'][
                                'registrant_address_two'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_country'],
                        }
                    ],
                },
            ]
        }

        # # People
        # build contact
        _main_contact = Person(
            name=parsed_form['registrant']['registrant_contact_name'],
            source_identified=True
        )

        main_contact_contact_details = [
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            }
        ]

        for cd in main_contact_contact_details:
            _main_contact.add_contact_detail(**cd)

        if _registrant._type == 'organization':
            _registrant.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant_self_employment.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

        # # Client
        # build client
        _client = Organization(
            name=parsed_form['client']['client_name'],
            classification='company',
            source_identified=True
        )

        client_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['client']['client_address'],
                        parsed_form['client']['client_city'],
                        parsed_form['client']['client_state'],
                        parsed_form['client']['client_zip'],
                        parsed_form['client']['client_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        client_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_ppb_city'],
                    parsed_form['client']['client_ppb_state'],
                    parsed_form['client']['client_ppb_zip'],
                    parsed_form['client']['client_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if client_contact_ppb["value"]:
            client_contact_details.append(client_contact_ppb)

        for cd in client_contact_details:
            _client.add_contact_detail(**cd)

        _client.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": parsed_form['client']['client_address'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client']['client_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client'][
                                'client_ppb_country'],
                        }
                    ],
                },
            ],
        }

        # Collect Foreign Entities
        _foreign_entities = []
        _foreign_entities_by_name = {}
        for fe in parsed_form['foreign_entities']:
            fe_extras = {}
            fe_name = fe['foreign_entity_name']

            # check for name-based duplicates
            if fe_name in _foreign_entities_by_name:
                _foreign_entity = _foreign_entities_by_name[fe_name]
            else:
                _foreign_entity = Organization(
                    name=fe_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            foreign_entity_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_address'],
                            fe['foreign_entity_city'],
                            fe['foreign_entity_state'],
                            fe['foreign_entity_country']]
                        if len(p) > 0]).strip(),
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_ppb_state'],
                            fe['foreign_entity_ppb_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            foreign_entity_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_city'],
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]),
            }

            if foreign_entity_contact_ppb["value"]:
                foreign_entity_contact_details.append(
                    foreign_entity_contact_ppb)

            # add contact details
            for cd in foreign_entity_contact_details:
                if cd['value'] != '':
                    _foreign_entity.add_contact_detail(**cd)

            # add extras
            fe_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": fe['foreign_entity_address'],
                        },
                        {
                            "note": "city",
                            "value": fe['foreign_entity_city'],
                        },
                        {
                            "note": "state",
                            "value": fe['foreign_entity_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "state",
                            "value": fe['foreign_entity_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_ppb_country'],
                        }
                    ],
                },
            ]

            _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                                   fe_extras)

            _foreign_entities_by_name[fe_name] = _foreign_entity

        for unique_foreign_entity in _foreign_entities_by_name.values():
            _foreign_entities.append(unique_foreign_entity)

            # TODO: add a variant on memberships to represent inter-org
            # relationships (associations, ownership, etc)
            #
            # _client['memberships'].append({
            #     "id": _foreign_entity['id'],
            #     "classification": "organization",
            #     "name": _foreign_entity['name'],
            #     "extras": {
            #         "ownership_percentage":
            #             fe['foreign_entity_amount']
            #     }
            # })

        # Collect Lobbyists
        # TODO: deal with weird non-name line continuation cases (blanks, "continued")
        _lobbyists_by_name = {}

        for l in parsed_form['lobbyists']:
            l_extras = {}
            l_name = ' '.join([l['lobbyist_first_name'],
                               l['lobbyist_last_name'],
                               l['lobbyist_suffix']
                               ]).strip()

            if l_name in _lobbyists_by_name:
                _lobbyist = _lobbyists_by_name[l_name]
            else:
                _lobbyist = Person(
                    name=l_name,
                    source_identified=True
                )

            if l['lobbyist_covered_official_position']:
                l_extras['lda_covered_official_positions'] = [
                    {
                        'date_reported':
                            parsed_form['datetimes']['effective_date'],
                        'covered_official_position':
                            l['lobbyist_covered_official_position']
                    },
                ]

            _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

            _lobbyists_by_name[l_name] = _lobbyist

        _lobbyists = []
        for unique_lobbyist in _lobbyists_by_name.values():
            _lobbyists.append(unique_lobbyist)

        if _registrant._type == 'organization':
            for l in _lobbyists:
                _registrant.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )
        else:
            for l in _lobbyists:
                _registrant_self_employment.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )

        # # Document
        # build document
        _disclosure.add_document(
            note='submitted filing',
            date=parsed_form['datetimes']['effective_date'][:10],
            url=response.url
        )

        # Collect Affiliated orgs
        _affiliated_organizations = []
        _affiliated_organizations_by_name = {}
        for ao in parsed_form['affiliated_organizations']:
            ao_extras = {}
            ao_name = ao['affiliated_organization_name']
            if ao_name in _affiliated_organizations_by_name:
                # There's already one by this name
                _affiliated_organization = _affiliated_organizations_by_name[ao_name]
            else:
                # New affiliated org
                _affiliated_organization = Organization(
                    name=ao_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            affiliated_organization_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            ao['affiliated_organization_address'],
                            ao['affiliated_organization_city'],
                            ao['affiliated_organization_state'],
                            ao['affiliated_organization_zip'],
                            ao['affiliated_organization_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            affiliated_organization_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_ppb_city'],
                        ao['affiliated_organization_ppb_state'],
                        ao['affiliated_organization_ppb_country']]
                    if len(p) > 0]).strip(),
            }

            if affiliated_organization_contact_ppb["value"]:
                affiliated_organization_contact_details.append(
                    affiliated_organization_contact_ppb)

            # add contact details
            for cd in affiliated_organization_contact_details:
                _affiliated_organization.add_contact_detail(**cd)

            ao_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": ao['affiliated_organization_address'],
                        },
                        {
                            "note": "city",
                            "value": ao['affiliated_organization_city'],
                        },
                        {
                            "note": "state",
                            "value": ao['affiliated_organization_state'],
                        },
                        {
                            "note": "zip",
                            "value": ao['affiliated_organization_zip'],
                        },
                        {
                            "note": "country",
                            "value": ao['affiliated_organization_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value":
                                ao['affiliated_organization_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value":
                                ao['affiliated_organization_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value":
                                ao['affiliated_organization_ppb_country'],
                        }
                    ],
                },
            ],

            _affiliated_organization.extras = combine_dicts(
                _affiliated_organization.extras, ao_extras)

        for unique_affiliated_organization in _affiliated_organizations_by_name.values():
            _affiliated_organizations.append(unique_affiliated_organization)

        # # Events & Agendas
        # name
        if parsed_form['registration_type']['new_registrant']:
            registration_type = 'New Client, New Registrant'
        elif parsed_form['registration_type']['is_amendment']:
            registration_type = 'Amended Registration'
        else:
            registration_type = 'New Client for Existing Registrant'

        # Create registration event
        _event = Event(
            name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                            rt=registration_type,
                                            cn=_client.name),
            timezone='America/New_York',
            location='United States',
            start_time=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification='registration'
        )

        # add participants
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

        if _registrant._type == 'person':
            _event.add_participant(type=_registrant._type,
                                   id=_registrant._id,
                                   name=_registrant.name,
                                   note="registrant")

        _event.add_participant(type=_client._type,
                               id=_client._id,
                               name=_client.name,
                               note="client")

        for l in _lobbyists:
            _event.add_participant(type=l._type,
                                   id=l._id,
                                   name=l.name,
                                   note='lobbyist')

        for fe in _foreign_entities:
            _event.add_participant(type=fe._type,
                                   id=fe._id,
                                   name=fe.name,
                                   note='foreign_entity')

        for ao in _affiliated_organizations:
            _event.add_participant(type=ao._type,
                                   id=ao._id,
                                   name=ao.name,
                                   note='affiliated_organization')

        # add agenda item
        _agenda = _event.add_agenda_item(
            description='issues lobbied on',
        )

        _agenda['notes'].append(
            parsed_form['lobbying_issues_detail']
        )

        for li in parsed_form['lobbying_issues']:
            if li['general_issue_area'] != '':
                _agenda.add_subject(li['general_issue_area'])

        _disclosure.add_disclosed_event(
            name=_event.name,
            type=_event._type,
            classification=_event.classification,
            id=_event._id
        )

        # add registrant to disclosure's _related and related_entities fields
        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        _registrant.add_source(
            url=_source['url'],
            note='registrant'
        )
        yield _registrant

        if _registrant_self_employment is not None:
            _registrant_self_employment.add_source(
                url=_source['url'],
                note='registrant_self_employment'
            )

            yield _registrant_self_employment

        _client.add_source(
            url=_source['url'],
            note='client'
        )
        yield _client

        _main_contact.add_source(
            url=_source['url'],
            note='main_contact'
        )
        yield _main_contact

        for ao in _affiliated_organizations:
            ao.add_source(
                url=_source['url'],
                note='affiliated_organization'
            )
            yield ao
        for fe in _foreign_entities:
            fe.add_source(
                url=_source['url'],
                note='foreign_entity'
            )
            yield fe
        for l in _lobbyists:
            l.add_source(
                url=_source['url'],
                note='lobbyist'
            )
            yield l

        _event.add_source(**_source)
        yield _event
        _disclosure.add_source(**_source)
        yield _disclosure
Example #13
0
    def _scrape_lower_chamber(self, session):
        """Scrape the active lower-chamber committees and their members.

        Downloads the ``ActiveCommittees.aspx`` listing found under
        ``self._reps_url_base``, then follows each row's link to the
        committee page and reads the member rows of its ``memGroup`` table.

        :param session: not used by this method.
        :returns: generator yielding one ``Organization`` (classification
            ``'committee'``) per listed committee.  A committee whose name
            contains "joint" is assigned the ``'joint'`` chamber instead of
            ``'lower'``.  Each organization carries the listing status in
            ``extras['status']`` ('' when the row has no status field).
        """
        self.info('Scraping lower chamber for committees.')

        chamber = 'lower'
        # Filler words stripped, in this order, from the title-cased name.
        name_filler = (
            'Committee On ', 'Special', 'Select', 'Joint', ' Committee',
        )

        url = '{base}ActiveCommittees.aspx'.format(base=self._reps_url_base)
        page_string = self.get(url).text
        page = lxml.html.fromstring(page_string)
        table = page.xpath('//div[@class="lightened"]/table[1]')[0]
        # Last tr has the date
        trs = table.xpath('tr')[:-1]
        for tr in trs:
            committee_parts = [
                part.strip() for part in tr.text_content().split(',')
            ]
            committee_name = committee_parts[0].title().strip()
            # The status is the second comma-separated field.  split() always
            # returns at least one element, so the guard must be "> 1"; a row
            # without a comma gets an empty status instead of raising
            # IndexError or reusing the previous row's value.
            status = committee_parts[1] if len(committee_parts) > 1 else ''
            committee_url = tr.xpath('td/a')[0].attrib.get('href')
            committee_url = '{base}{url}'.format(base=self._reps_url_base,
                                                 url=committee_url)
            # Decide the chamber before "Joint" is stripped from the name.
            actual_chamber = chamber
            if 'joint' in committee_name.lower():
                actual_chamber = 'joint'

            for filler in name_filler:
                committee_name = committee_name.replace(filler, '')
            committee_name = committee_name.strip()

            committee = Organization(
                committee_name,
                chamber=actual_chamber,
                classification='committee',
            )
            committee.extras = {'status': status}
            committee_page_string = self.get(committee_url).text
            committee_page = lxml.html.fromstring(committee_page_string)
            # First tr has the title (sigh)
            mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
            for mem_tr in mem_trs:
                # href of the first link in the row, or None when there is
                # no link; stored on the membership as extras['code'].
                mem_code = None
                mem_links = mem_tr.xpath('td/a[1]')
                if len(mem_links):
                    mem_code = mem_links[0].attrib.get('href')
                # Output is "Rubble, Barney, Neighbor"
                mem_parts = mem_tr.text_content().strip().split(',')
                if self._no_members_text in mem_parts:
                    continue
                # Reorder "Last, First" into "First Last".
                mem_name = (mem_parts[1].strip() + ' ' + mem_parts[0].strip())
                # Sometimes Senator abbreviation is in the name
                mem_name = mem_name.replace('Sen. ', '')
                mem_role = 'member'
                if len(mem_parts) > 2:
                    # Handle the case where there is a comma in the
                    # role name
                    mem_role = ', '.join([p.strip()
                                          for p in mem_parts[2:]]).lower()
                membership = committee.add_member(mem_name, role=mem_role)
                membership.extras = {'code': mem_code}

            committee.add_source(url)
            committee.add_source(committee_url)

            yield committee