def scrape(self):
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
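committees_from_sessions() and build_lookup_dict() are repo-local helpers that are not shown in this snippet. The sketch below is only an assumption about the grouping the loop above relies on: one list of per-term committee instances under each two-letter code. The function name and signature here are illustrative, not the project's actual helper.

# Assumed behaviour only; the real helper takes the scraper as its first
# argument and may do more (logging, validation) than this sketch.
from collections import defaultdict

def build_lookup_dict_sketch(scraper, data_list, index_key):
    lookup = defaultdict(list)
    for item in data_list:
        lookup[item[index_key]].append(item)   # e.g. 'EX' -> [term instances...]
    return dict(lookup)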
def test_extras_organization():
    org = ScrapeOrganization('United Nations', classification='international')
    org.extras = {"hello": "world", "foo": {"bar": "baz"}}
    od = org.as_dict()

    OrganizationImporter('jurisdiction-id').import_data([od])
    o = Organization.objects.get()
    assert o.extras['foo']['bar'] == 'baz'
def scrape(self):
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue
        # When there are no meetings scheduled, there is no way to deduce the committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(inst['code'], inst['term'])
                if meeting_id:
                    seen_posts = []
                    membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                    for councillor in self.councillorMembers(membership_url):
                        o.add_member(councillor['name'], councillor['role'])
                        if councillor['role'] not in seen_posts:
                            o.add_post(
                                role=councillor['role'],
                                label=councillor['role'],
                                # TODO: More specific divisions for some committee?
                                division_id=self.jurisdiction.division_id,
                            )
                            seen_posts.append(councillor['role'])

            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
def scrape_committee(self, comm_num):
    url = self.committee_url(comm_num)
    page = self.lxmlize(url)

    # get title
    comm_name = page.xpath("//h1/text()")[0]

    # create object
    comm = Organization(name=comm_name, classification="committee",
                        chamber="legislature")
    comm.add_source(url=url)

    # add posts
    comm.add_post(label="chair", role="chair")
    # FIXME do we need a separate post for each member?
    # FIXME is member an appropriate name?
    comm.add_post(label="member", role="member")

    # helper for finding other nodes
    landmark_node = page.xpath("//h2[text()='Committee Members']")[0]

    # add memberships
    member_names = landmark_node.xpath("following-sibling::div/ul/li/a/text()")
    fl_names = [HumanName.name_firstandlast(name) for name in member_names]
    print("My attempt to scrub people's names:", list(zip(member_names, fl_names)))
    chair_name, *other_names = fl_names
    if chair_name not in {'Lewis Reed'}:
        comm.add_member(chair_name, role="chair")
    for name in other_names:
        if name not in {'Lewis Reed'}:
            comm.add_member(name, role="member")

    # add description
    about_node = page.xpath("//h2[text()='About']")[0]
    (description, ) = about_node.xpath(
        "parent::div//div[@class='content-block']/p[2]/text()")
    description = description.strip()
    comm.extras = {"description": description}

    yield comm
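HumanName.name_firstandlast() used above is a helper that is not included in this snippet. A minimal sketch of what it is assumed to do, built on the nameparser package, could look like the following; the function name and exact behaviour are assumptions, not the project's actual helper.

# Assumed behaviour only: reduce a scraped display name to a bare "First Last"
# string, dropping titles, middle names and suffixes.
from nameparser import HumanName as ParsedName

def name_firstandlast_sketch(raw_name):
    parsed = ParsedName(raw_name)    # splits title/first/middle/last/suffix
    return ' '.join(part for part in (parsed.first, parsed.last) if part)

# name_firstandlast_sketch("Dr. Joseph Vollmer Jr.") -> "Joseph Vollmer"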
def scrape_session(self, session, chambers):
    sid = SESSION_SITE_IDS[session]
    committees = backoff(self.cservice.GetCommitteesBySession, sid)

    # if committees.strip() == "":
    #     return
    # If we get here, it's a problem.
    # Commenting this out for future debugging. - PRT

    if str(committees).strip() == "":
        raise ValueError("Error: No committee data for sid: %s" % (sid))

    committees = committees['CommitteeListing']
    for committee in committees:
        cid = committee['Id']
        committee = backoff(self.cservice.GetCommittee, cid)
        subctty_cache = {}

        comname, typ, guid, code, description = [committee[x] for x in [
            'Name', 'Type', 'Id', 'Code', 'Description'
        ]]
        comchamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }[typ]

        ctty_key = '{}-{}'.format(typ, code)
        if ctty_key not in self.ctty_cache:
            ctty = Organization(chamber=comchamber, name=comname,
                                classification='committee')
            ctty.extras = {
                'code': code,
                'guid': guid,
                'description': description,
            }
            self.ctty_cache[ctty_key] = ctty
        else:
            # Reuse the committee already built for this chamber/code pair.
            ctty = self.ctty_cache[ctty_key]

        members = committee['Members']['CommitteeMember']
        for member in members:
            name = "{First} {Last}".format(**dict(member['Member']['Name']))
            role = member['Role']
            membership = ctty.add_member(name, role)
            membership.extras = {'guid': member['Member']['Id']}

            subcoms = member['SubCommittees'] or []
            for subcom in subcoms:
                subcom = subcom[1][0]
                subguid = subcom['Id']
                subcommittee = subcom['Name']
                if subcommittee in subctty_cache:
                    # Add member to existing subcommittee.
                    subctty = subctty_cache[subcommittee]
                else:
                    # Create subcommittee.
                    subctty = Organization(
                        name=subcommittee,
                        classification='committee',
                        parent_id={'classification': comchamber,
                                   'name': comname})
                    subctty.extras = {
                        'guid': subguid,
                    }
                    subctty.add_source(self.csource)
                    subctty.add_source(CTTIE_URL.format(**{
                        "sid": sid,
                        "cttie": guid,
                    }))
                    subctty_cache[subcommittee] = subctty

                membership = subctty.add_member(name, role)
                membership.extras = {'guid': member['Member']['Id']}

        for subctty in subctty_cache.values():
            yield subctty

        ctty.add_source(self.csource)
        ctty.add_source(CTTIE_URL.format(**{
            "sid": sid,
            "cttie": guid,
        }))
        yield ctty
def _scrape_lower_chamber(self, session):
    self.info('Scraping lower chamber for committees.')

    chamber = 'lower'

    url = '{base}ActiveCommittees.aspx'.format(base=self._reps_url_base)
    page_string = self.get(url).text
    page = lxml.html.fromstring(page_string)
    table = page.xpath('//div[@class="lightened"]/table[1]')[0]
    # Last tr has the date
    trs = table.xpath('tr')[:-1]
    for tr in trs:
        committee_parts = [part.strip()
                           for part in tr.text_content().split(',')]
        committee_name = committee_parts[0].title().strip()

        status = ''
        # The status only exists when the row has a second comma-separated part.
        if len(committee_parts) > 1:
            status = committee_parts[1].strip()

        committee_url = tr.xpath('td/a')[0].attrib.get('href')
        committee_url = '{base}{url}'.format(base=self._reps_url_base,
                                             url=committee_url)

        actual_chamber = chamber
        if 'joint' in committee_name.lower():
            actual_chamber = 'joint'

        committee_name = committee_name.replace('Committee On ', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Select', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Joint', '')
        committee_name = committee_name.replace(' Committee', '')
        committee_name = committee_name.strip()

        committee = Organization(
            committee_name,
            chamber=actual_chamber,
            classification='committee',
        )
        committee.extras = {'status': status}

        committee_page_string = self.get(committee_url).text
        committee_page = lxml.html.fromstring(committee_page_string)
        # First tr has the title (sigh)
        mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
        for mem_tr in mem_trs:
            mem_code = None
            mem_links = mem_tr.xpath('td/a[1]')
            if len(mem_links):
                mem_code = mem_links[0].attrib.get('href')

            # Output is "Rubble, Barney, Neighbor"
            mem_parts = mem_tr.text_content().strip().split(',')
            if self._no_members_text in mem_parts:
                continue
            mem_name = (mem_parts[1].strip() + ' ' + mem_parts[0].strip())
            # Sometimes Senator abbreviation is in the name
            mem_name = mem_name.replace('Sen. ', '')

            mem_role = 'member'
            if len(mem_parts) > 2:
                # Handle the case where there is a comma in the role name
                mem_role = ', '.join(
                    [p.strip() for p in mem_parts[2:]]).lower()

            membership = committee.add_member(mem_name, role=mem_role)
            membership.extras = {'code': mem_code}

        committee.add_source(url)
        committee.add_source(committee_url)
        yield committee
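backoff() wraps the SOAP calls above but is not defined in this snippet. The sketch below only illustrates the retry-with-sleep behaviour it is assumed to provide; the retry count, delay, and exception handling here are assumptions.

import time

def backoff_sketch(function, *args, retries=5, delay=15, **kwargs):
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return function(*args, **kwargs)
        except Exception as error:        # the real helper may catch narrower errors
            last_error = error
            time.sleep(delay * attempt)   # wait a bit longer after each failure
    raise last_error

# e.g. committees = backoff_sketch(cservice.GetCommitteesBySession, sid)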
def transform_parse(self, parsed_form, response):
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
        'general_description': parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }

    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description': parsed_form['client']['client_general_description']
    }

    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant':
            parsed_form['registration_type']['new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address_one",
                     "value": parsed_form['registrant']['registrant_address_one']},
                    {"note": "address_two",
                     "value": parsed_form['registrant']['registrant_address_two']},
                    {"note": "city",
                     "value": parsed_form['registrant']['registrant_city']},
                    {"note": "state",
                     "value": parsed_form['registrant']['registrant_state']},
                    {"note": "zip",
                     "value": parsed_form['registrant']['registrant_zip']},
                    {"note": "country",
                     "value": parsed_form['registrant']['registrant_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city",
                     "value": parsed_form['registrant']['registrant_ppb_city']},
                    {"note": "state",
                     "value": parsed_form['registrant']['registrant_ppb_state']},
                    {"note": "zip",
                     "value": parsed_form['registrant']['registrant_ppb_zip']},
                    {"note": "country",
                     "value": parsed_form['registrant']['registrant_ppb_country']},
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": parsed_form['client']['client_address']},
                    {"note": "city", "value": parsed_form['client']['client_city']},
                    {"note": "state", "value": parsed_form['client']['client_state']},
                    {"note": "zip", "value": parsed_form['client']['client_zip']},
                    {"note": "country", "value": parsed_form['client']['client_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city", "value": parsed_form['client']['client_ppb_city']},
                    {"note": "state", "value": parsed_form['client']['client_ppb_state']},
                    {"note": "zip", "value": parsed_form['client']['client_ppb_zip']},
                    {"note": "country", "value": parsed_form['client']['client_ppb_country']},
                ],
            },
        ],
    }

    # Collect Foreign Entities
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": fe['foreign_entity_address']},
                    {"note": "city", "value": fe['foreign_entity_city']},
                    {"note": "state", "value": fe['foreign_entity_state']},
                    {"note": "country", "value": fe['foreign_entity_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "state", "value": fe['foreign_entity_ppb_state']},
                    {"note": "country", "value": fe['foreign_entity_ppb_country']},
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras, fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists
    # TODO: deal with weird non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}

    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported': parsed_form['datetimes']['effective_date'],
                    'covered_official_position': l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    if _registrant._type == 'organization':
        for l in _lobbyists:
            _registrant.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for l in _lobbyists:
            _registrant_self_employment.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": ao['affiliated_organization_address']},
                    {"note": "city", "value": ao['affiliated_organization_city']},
                    {"note": "state", "value": ao['affiliated_organization_state']},
                    {"note": "zip", "value": ao['affiliated_organization_zip']},
                    {"note": "country", "value": ao['affiliated_organization_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city", "value": ao['affiliated_organization_ppb_city']},
                    {"note": "state", "value": ao['affiliated_organization_ppb_state']},
                    {"note": "country", "value": ao['affiliated_organization_ppb_country']},
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)
        # remember this org so the dedup pass below can emit it
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type, id=_registrant._id,
                           name=_registrant.name, note="registrant")

    if _registrant._type == 'person':
        _event.add_participant(type=_registrant._type, id=_registrant._id,
                               name=_registrant.name, note="registrant")

    _event.add_participant(type=_client._type, id=_client._id,
                           name=_client.name, note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type, id=l._id,
                               name=l.name, note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type, id=fe._id,
                               name=fe.name, note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type, id=ao._id,
                               name=ao.name, note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(url=_source['url'], note='registrant')
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(url=_source['url'], note='client')
    yield _client

    _main_contact.add_source(url=_source['url'], note='main_contact')
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(url=_source['url'], note='affiliated_organization')
        yield ao

    for fe in _foreign_entities:
        fe.add_source(url=_source['url'], note='foreign_entity')
        yield fe

    for l in _lobbyists:
        l.add_source(url=_source['url'], note='lobbyist')
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
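combine_dicts() is used above to fold per-filing extras into entities that may already carry extras from an earlier record; it is not defined in this snippet. The sketch below is only an assumed recursive merge, not the project's actual helper.

# Assumed behaviour only: merge two extras dicts, preferring newer values and
# merging nested dicts instead of overwriting them.
def combine_dicts_sketch(base, update):
    merged = dict(base or {})
    for key, value in (update or {}).items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = combine_dicts_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged

# combine_dicts_sketch({'a': {'x': 1}}, {'a': {'y': 2}}) -> {'a': {'x': 1, 'y': 2}}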