def scrape_votes_old(self, bill, billname, session):
    """Scrape votes for *bill* from the archived Ohio legislature site.

    Yields one VoteEvent per journal-text link on the bill's page.
    """
    vote_url = ("http://archives.legislature.state.oh.us/bills.cfm?ID=" +
                session + "_" + billname)
    doc = lxml.html.fromstring(self.get(vote_url).text)

    def collect_names(div):
        # Non-empty cell text from a Yea/Nay <div>'s inner table.
        return [cell.xpath("string()")
                for cell in div.xpath("table/tr/td")
                if cell.xpath("string()")]

    for journal_link in doc.xpath("//a[contains(@href, 'JournalText')]"):
        vote_date = self._tz.localize(
            datetime.datetime.strptime(journal_link.text, "%m/%d/%Y")).date()
        vote_date = "{:%Y-%m-%d}".format(vote_date)

        details = journal_link.xpath("string(../../../td[2])")
        raw_chamber = details.split(" - ")[0]
        if raw_chamber == "House":
            chamber = "lower"
        elif raw_chamber == "Senate":
            chamber = "upper"
        else:
            raise ScrapeError("Bad chamber: %s" % raw_chamber)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        # The roll-call row immediately follows the journal-link row.
        vote_row = journal_link.xpath("../../..")[0].getnext()
        yeas = collect_names(
            vote_row.xpath("td/font/div[contains(@id, 'Yea')]")[0])
        nays = collect_names(
            vote_row.xpath("td/font/div[contains(@id, 'Nay')]")[0])

        vote = VoteEvent(
            chamber=chamber,
            start_date=vote_date,
            motion_text=motion,
            result="pass" if len(yeas) > len(nays) else "fail",
            bill=bill,
            classification="passed",
        )
        for name in yeas:
            vote.yes(name)
        for name in nays:
            vote.no(name)
        vote.add_source(vote_url)
        yield vote
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one measure type from the mirrored DB.

    Yields a VoteEvent for each recorded roll call, then the Bill itself.

    Fix: `filter(None, committees)` returned a one-shot iterator in
    Python 3; `len(list(...))` in the sanity assert exhausted it, so the
    subsequent `zip(committees, matched_abbrs)` and the later iteration
    over `kwargs['committees']` saw an empty sequence. The committee list
    is now materialized as a real list.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        # NOTE(review): bill_session is computed but never used below —
        # kept for parity with the original; confirm before removing.
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url (the real bill page).
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest text (aka "summary") from the latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # Create a version name matching the state's format,
            # e.g. "02/06/17 - Enrolled".
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles.
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # Sponsorships come from the last processed version.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing.
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {
                    'Assembly': 'lower',
                    'Senate': 'upper'
                }[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {
                            'Assembly': 'lower',
                            'Senate': 'upper'
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # Materialize: a filter() iterator would be exhausted by
                # the assert below and leave zip()/later loops empty.
                committees = [c for c in committees if c]
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee,
                                          entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            # Normalize the motion text: strip session/chamber prefixes
            # and trailing bill-id noise.
            motion = motion.strip()
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes.
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy.
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
    self.session.expire_all()
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape a single SD bill page: versions, sponsors, actions, votes.

    Yields each roll-call VoteEvent found, then the Bill itself.
    """
    doc = self.lxmlize(url)

    # Classify the bill from its id prefix; first matching pattern wins.
    btype = ['bill']
    for pattern, classification in ((r'^(S|H)B ', 'bill'),
                                    (r'(S|H)C ', 'commemoration'),
                                    (r'(S|H)JR ', 'joint resolution'),
                                    (r'(S|H)CR ', 'concurrent resolution')):
        if re.match(pattern, bill_id):
            btype = [classification]
            break

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=btype)
    bill.add_source(url)

    regex_ns = "http://exslt.org/regular-expressions"
    for anchor in doc.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns}):
        bill.add_version_link(anchor.xpath('string()').strip(),
                              anchor.attrib['href'],
                              media_type='text/html',
                              on_duplicate='ignore')

    sponsor_anchors = doc.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
        '/label[contains(text(), "Sponsors:")]' +
        '/following-sibling::div[1]/p/a')
    for anchor in sponsor_anchors:
        href = anchor.attrib['href']
        if href.startswith('https://sdlegislature.gov/Legislators/'):
            sponsor_type = 'person'
        elif href.startswith(
                'https://sdlegislature.gov/Legislative_Session/Committees'):
            sponsor_type = 'organization'
        else:
            raise ScrapeError('Found unexpected sponsor, URL: ' + href)
        bill.add_sponsorship(anchor.text, classification='primary',
                             primary=True, entity_type=sponsor_type)

    actor = chamber
    past_header = False
    for row in doc.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows that are just `<tr></tr>`.
        row_text = row.text_content()
        if row_text == '':
            self.debug('Skipping action table row that is completely empty')
            continue
        # Skip everything up to and including the header row.
        if 'Date' in row_text and 'Action' in row_text:
            past_header = True
            continue
        elif not past_header:
            continue

        action_text = row.xpath("string(td[2])").strip()
        classifications = []

        if action_text.startswith('First read'):
            classifications.extend(['introduction', 'reading-1'])
        if re.match(r'Signed by (?:the\s)*Governor', action_text,
                    re.IGNORECASE):
            classifications.append('executive-signature')
            actor = 'executive'

        pass_match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                              action_text)
        if pass_match:
            prefix = ('' if pass_match.group(1) in
                      ['Senate', 'House of Representatives']
                      else 'committee-')
            outcome = pass_match.group(3).lower()
            if outcome == 'passed':
                suffix = 'passage'
            elif outcome == 'failed':
                suffix = 'failure'
            classifications.append("%s%s" % (prefix, suffix))

        if 'referred to' in action_text.lower():
            classifications.append('referral-committee')
        if 'Motion to amend, Passed Amendment' in action_text:
            classifications.extend(['amendment-introduction',
                                    'amendment-passage'])
        if 'Veto override, Passed' in action_text:
            classifications.append('veto-override-passage')
        elif 'Veto override, Failed' in action_text:
            classifications.append('veto-override-failure')
        if 'Delivered to the Governor' in action_text:
            classifications.append('executive-receipt')

        read_match = re.match("First read in (Senate|House)", action_text)
        if read_match:
            actor = 'upper' if read_match.group(1) == 'Senate' else 'lower'

        raw_date = row.xpath("string(td[1])").strip()
        if not re.match(r'\d{2}/\d{2}/\d{4}', raw_date):
            self.warning("Bad date: %s" % raw_date)
            continue
        action_date = datetime.datetime.strptime(raw_date,
                                                 "%m/%d/%Y").date()

        for anchor in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, action_date,
                                        anchor.attrib['href'])

        bill.add_action(action_text, action_date, chamber=actor,
                        classification=classifications)

    for anchor in doc.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(anchor.text.strip())

    yield bill
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*."""
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, chamber_code in (('House', 'lower'),
                                 ('Senate', 'upper'),
                                 ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {'Concurred in amendments': 'amendment',
                     'Veto override': 'veto_override'}.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=vote_type,
                     bill=bill)
    # The vote page URL has a unique ID, but some votes are "consent
    # calendar" events relating to the passage of _multiple_ bills.
    # These can't be modeled yet in Pupa, so append a bill ID to the URL
    # that forms the `pupa_id`.
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))
    vote.add_source(url)

    for option, total in (('yes', yes_count),
                          ('no', no_count),
                          ('excused', excused_count),
                          ('absent', absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ('Aye', 'Yea'):
            vote.yes(cell.getprevious().text.strip())
        elif label == 'Nay':
            vote.no(cell.getprevious().text.strip())
        elif label == 'Excused':
            vote.vote('excused', cell.getprevious().text.strip())
        elif label == 'Absent':
            vote.vote('absent', cell.getprevious().text.strip())

    yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*.

    Variant that keys the `pupa_id` on the bare vote-page URL.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, chamber_code in (('House', 'lower'),
                                 ('Senate', 'upper'),
                                 ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {'Concurred in amendments': 'amendment',
                     'Veto override': 'veto_override'}.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=vote_type,
                     bill=bill)
    vote.pupa_id = url  # vote id is in URL
    vote.add_source(url)

    for option, total in (('yes', yes_count),
                          ('no', no_count),
                          ('excused', excused_count),
                          ('absent', absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ('Aye', 'Yea'):
            vote.yes(cell.getprevious().text.strip())
        elif label == 'Nay':
            vote.no(cell.getprevious().text.strip())
        elif label == 'Excused':
            vote.vote('excused', cell.getprevious().text.strip())
        elif label == 'Absent':
            vote.vote('absent', cell.getprevious().text.strip())

    yield vote
def scrape_bill(self, session, history_url):
    """Scrape one TX bill from its history XML and yield the Bill.

    Pulls title/id from the XML, attaches pre-indexed version, analysis,
    fiscal-note, and witness-list links, classifies each action, and adds
    author/coauthor/sponsor/cosponsor sponsorships.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if (bill_title is None or
            "Bill does not exist" in history_xml):
        self.warning("Bill does not appear to exist")
        return
    # The 'bill' attribute has a leading token before the id proper —
    # drop it (presumably the session prefix; verify against the feed).
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Classify from the id: xB bill, xR resolution, xCR/xJR concurrent/
    # joint resolution (two-char check must come via slices below).
    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification=bill_type)
    bill.add_source(history_url)

    for subject in root.iterfind('subjects/subject'):
        bill.add_subject(subject.text.strip())

    # self.versions/analyses/fiscal_notes/witnesses are pre-built
    # (bill_id, url) indexes; url[-5] selects the version-letter slug.
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(note=self.NAME_SLUGS[version[1][-5]],
                              url=version[1],
                              media_type='text/html')

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(note="Analysis ({})".format(
            self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type='text/html')

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(note="Fiscal Note ({})".format(
            self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type='text/html')

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(note="Witness List ({})".format(
            self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type='text/html')

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        # First char of actionNumber encodes the acting body.
        action_number = action.find('actionNumber').text
        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[action_number[0]]

        desc = action.findtext('description').strip()

        # NOTE: this chain is order-sensitive — e.g. the exact match
        # "Reported favorably w/o amendment(s)" must precede the
        # "Reported favorably" prefix check. Do not reorder.
        if desc == 'Scheduled for public hearing on . . .':
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment-passage'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment-introduction'
        elif desc == 'Amendment amended':
            atype = 'amendment-amendment'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment-withdrawal'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'passage'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'introduction'
            else:
                atype = 'filing'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'executive-receipt'
        elif desc.startswith('Signed by the Governor'):
            atype = 'executive-signature'
        elif desc.startswith('Effective on'):
            atype = 'became-law'
        elif desc == 'Vetoed by the Governor':
            atype = 'executive-veto'
        elif desc == 'Read first time':
            atype = ['introduction', 'reading-1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['passage']
            if not introduced:
                introduced = True
                atype.append('introduction')
        elif desc == "Passed as amended":
            atype = 'passage'
        elif (desc.startswith('Referred to') or
                desc.startswith("Recommended to be sent to ")):
            atype = 'referral-committee'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee-passage'
        elif desc == "Filed":
            atype = 'filing'
        elif desc == 'Read 3rd time':
            atype = 'reading-3'
        elif desc == 'Read 2nd time':
            atype = 'reading-2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee-passage-favorable'
        else:
            atype = None

        act = bill.add_action(action.findtext('description'), act_date,
                              chamber=actor, classification=atype)

        # For referrals, derive the committee name from the action text.
        if atype and 'referral-committee' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type='organization')

    # Sponsorship lists are pipe-delimited; empty segments are skipped.
    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsorship(author, classification='primary',
                                 entity_type='person', primary=True)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsorship(coauthor, classification='cosponsor',
                                 entity_type='person', primary=False)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsorship(sponsor, classification='primary',
                                 entity_type='person', primary=True)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsorship(cosponsor, classification='cosponsor',
                                 entity_type='person', primary=False)

    if root.findtext('companions'):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape a single SD bill page: versions, sponsors, actions, votes.

    Yields each roll-call VoteEvent found, then the Bill itself.

    Fixes in the "Motion to amend" branch: the amendment anchor lookup
    previously did an unguarded `[0]` (IndexError when the row has no
    Amendment.aspx link), and `mimetype` was left unbound (NameError)
    when the amendment URL contained neither "htm" nor "pdf". Both cases
    now warn and skip the version link instead of crashing the scrape.
    """
    page = self.lxmlize(url)

    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )
    bill.add_source(url)

    regex_ns = "http://exslt.org/regular-expressions"
    version_links = page.xpath(
        r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
        namespaces={"re": regex_ns},
    )
    for link in version_links:
        bill.add_version_link(
            link.xpath("string()").strip(),
            link.attrib["href"],
            media_type="text/html",
            on_duplicate="ignore",
        )

    sponsor_links = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
        '/label[contains(text(), "Sponsors:")]' +
        "/following-sibling::div[1]/p/a")
    for link in sponsor_links:
        if link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislators/"):
            sponsor_type = "person"
        elif link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislative_Session/Committees"
        ):
            sponsor_type = "organization"
        else:
            raise ScrapeError("Found unexpected sponsor, URL: " +
                              link.attrib["href"])
        bill.add_sponsorship(
            link.text,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    actor = chamber
    use_row = False
    for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows, that are just `<tr></tr>`
        # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
        if row.text_content() == "":
            self.debug(
                "Skipping action table row that is completely empty")
            continue
        # Skip rows until past the Date/Action header row.
        if "Date" in row.text_content() and "Action" in row.text_content():
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith("First read"):
            atypes.append("introduction")
            atypes.append("reading-1")
        if re.match(r"Signed by (?:the\s)*Governor", action,
                    re.IGNORECASE):
            atypes.append("executive-signature")
            actor = "executive"

        match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)",
                         action)
        if match:
            if match.group(1) in ["Senate", "House of Representatives"]:
                first = ""
            else:
                first = "committee-"
            if match.group(3).lower() == "passed":
                second = "passage"
            elif match.group(3).lower() == "failed":
                second = "failure"
            atypes.append("%s%s" % (first, second))

        if "referred to" in action.lower():
            atypes.append("referral-committee")

        if "Motion to amend, Passed Amendment" in action:
            atypes.append("amendment-introduction")
            atypes.append("amendment-passage")
            # Guard the anchor lookup: crashed with IndexError when the
            # row carried no Amendment.aspx link.
            amd_links = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')
            if amd_links:
                amd = amd_links[0]
                version_name = amd.xpath("string(.)")
                version_url = amd.xpath("@href")[0]
                if "htm" in version_url:
                    mimetype = "text/html"
                elif "pdf" in version_url:
                    mimetype = "application/pdf"
                else:
                    # Previously a NameError; warn and skip instead.
                    mimetype = None
                    self.warning("Unknown amendment mimetype for %s" %
                                 version_url)
                if mimetype:
                    bill.add_version_link(
                        version_name,
                        version_url,
                        media_type=mimetype,
                        on_duplicate="ignore",
                    )
            else:
                self.warning("No amendment link found in action: %s" %
                             action)

        if "Veto override, Passed" in action:
            atypes.append("veto-override-passage")
        elif "Veto override, Failed" in action:
            atypes.append("veto-override-failure")

        if "Delivered to the Governor" in action:
            atypes.append("executive-receipt")

        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == "Senate":
                actor = "upper"
            else:
                actor = "lower"

        date = row.xpath("string(td[1])").strip()
        match = re.match(r"\d{2}/\d{2}/\d{4}", date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib["href"])

        bill.add_action(action, date, chamber=actor,
                        classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*."""
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if "No Bill Action" in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(", ")
    location = header_parts[1]
    for prefix, chamber_code in (("House", "lower"),
                                 ("Senate", "upper"),
                                 ("Joint", "legislature")):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ", ".join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith("Do Pass"):
        vote_type = "passage"
    else:
        vote_type = {"Concurred in amendments": "amendment",
                     "Veto override": "veto_override"}.get(motion, "other")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        classification=vote_type,
        bill=bill,
    )
    # The vote page URL has a unique ID, but some votes are "consent
    # calendar" events relating to the passage of _multiple_ bills.
    # These can't be modeled yet in Pupa, so append a bill ID to the URL
    # that forms the `pupa_id`.
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = "{}#{}".format(url, bill.identifier.replace(" ", ""))
    vote.add_source(url)

    for option, total in (("yes", yes_count),
                          ("no", no_count),
                          ("excused", excused_count),
                          ("absent", absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ("Aye", "Yea"):
            vote.yes(cell.getprevious().text.strip())
        elif label == "Nay":
            vote.no(cell.getprevious().text.strip())
        elif label == "Excused":
            vote.vote("excused", cell.getprevious().text.strip())
        elif label == "Absent":
            vote.vote("absent", cell.getprevious().text.strip())

    yield vote
def scrape_bill(self, session, history_url):
    """Scrape one TX bill from its history XML and yield the Bill.

    Pulls title/id from the XML, attaches pre-indexed version, analysis,
    fiscal-note, and witness-list links, classifies each action, and adds
    author/coauthor/sponsor/cosponsor sponsorships.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    # The 'bill' attribute has a leading token before the id proper —
    # drop it (presumably the session prefix; verify against the feed).
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Classify from the id: xB bill, xR resolution, xCR/xJR concurrent/
    # joint resolution (two-char check done via slices below).
    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # self.versions/analyses/fiscal_notes/witnesses are pre-built
    # (bill_id, url) indexes; url[-5] selects the version-letter slug.
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type="text/html",
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()

        # First char of actionNumber encodes the acting body.
        action_number = action.find("actionNumber").text
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]

        desc = action.findtext("description").strip()

        # NOTE: this chain is order-sensitive — e.g. the exact match
        # "Reported favorably w/o amendment(s)" must precede the
        # "Reported favorably" prefix check. Do not reorder.
        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == "Amended":
            atype = "amendment-passage"
        elif desc == "Amendment(s) offered":
            atype = "amendment-introduction"
        elif desc == "Amendment amended":
            atype = "amendment-amendment"
        elif desc == "Amendment withdrawn":
            atype = "amendment-withdrawal"
        elif desc == "Passed" or desc == "Adopted":
            atype = "passage"
        elif re.match(r"^Received (by|from) the", desc):
            if "Secretary of the Senate" not in desc:
                atype = "introduction"
            else:
                atype = "filing"
        elif desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            atype = "executive-receipt"
        elif desc.startswith("Signed by the Governor"):
            atype = "executive-signature"
        elif desc.startswith("Effective on"):
            atype = "became-law"
        elif desc == "Vetoed by the Governor":
            atype = "executive-veto"
        elif desc == "Read first time":
            atype = ["introduction", "reading-1"]
            introduced = True
        elif desc == "Read & adopted":
            atype = ["passage"]
            if not introduced:
                introduced = True
                atype.append("introduction")
        elif desc == "Passed as amended":
            atype = "passage"
        elif desc.startswith("Referred to") or desc.startswith(
                "Recommended to be sent to "):
            atype = "referral-committee"
        elif desc == "Reported favorably w/o amendment(s)":
            atype = "committee-passage"
        elif desc == "Filed":
            atype = "filing"
        elif desc == "Read 3rd time":
            atype = "reading-3"
        elif desc == "Read 2nd time":
            atype = "reading-2"
        elif desc.startswith("Reported favorably"):
            atype = "committee-passage-favorable"
        else:
            atype = None

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        # For referrals, derive the committee name from the action text.
        if atype and "referral-committee" in atype:
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # Sponsorship lists are pipe-delimited; empty segments are skipped.
    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all CA bills of one measure type for a session.

    Queries the local mirror of CA's legislative database (``CABill`` rows)
    for the given session/measure type, then yields pupa ``VoteEvent``
    objects followed by the ``Bill`` itself for each matching measure.

    :param chamber: 'upper' or 'lower'; used as a sanity check against the
        bill id prefix and as the fallback actor for actions.
    :param session: session year string used in the DB query.
    :param bill_type: classification to apply to the bill (e.g. 'bill').
    :param type_abbr: measure-type abbreviation in the DB (e.g. 'AB', 'SB').
    :param committee_abbr_regex: compiled regex extracting committee
        abbreviations from action text. NOTE(review): evaluated once at
        import time as a default argument — assumed intentional (the regex
        is immutable), but worth confirming.
    :raises KeyError: when a matched committee abbreviation has no name
        mapping.
    :raises ScrapeError: when a vote location cannot be mapped to a chamber.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title="", chamber=chamber)

        # Senate bills must be scraped as 'upper', Assembly as 'lower';
        # a mismatch indicates bad upstream data, so skip the bill.
        if (bill_id.startswith("S") and chamber == "lower") or (
                bill_id.startswith("A") and chamber == "upper"):
            # Use the scraper's logger rather than bare print() so the
            # diagnostic ends up with the rest of the scrape output.
            self.warning("!!!! BAD ID/CHAMBER PAIR !!!! %s" % bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        # Initialize per-bill accumulators here so they can never leak a
        # value from a previous loop iteration (or be unbound) when a bill
        # has no usable versions.
        summary = ""
        tags = []
        impact_clause = None

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # Ensure a space after any ')' glued to the next word.
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # NOTE: 'version' is the last version seen in the loop above;
        # sponsors are taken from that (most recent) version.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue

            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:
                # re.sub only calls the replacer with a real Match object,
                # so no falsy-match branch is needed.
                def replacer(matchobj):
                    return {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[matchobj.group()]

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUG FIX: filter() returns a one-shot iterator in Python 3.
                # The old code exhausted it in the assert's list() call, so
                # the zip() below and the add_related_entity loop further
                # down iterated nothing. Materialize it once as a list.
                committees = list(filter(None, committees))
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                if not act_str.endswith("."):
                    act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            # Re-categorize after the committee-name substitutions above,
            # since they may change which patterns match.
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == "(PASS)":
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(" ")[0].lower()
            if first_part in ["asm", "assembly"]:
                vote_chamber = "lower"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith("sen"):
                vote_chamber = "upper"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ""
            else:
                motion = ""

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = "passage"
            elif "Do Pass" in motion:
                vtype = "passage"
            else:
                vtype = "other"

            # Strip session/chamber/measure-number noise off the motion text.
            motion = motion.strip()
            motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                re.IGNORECASE).sub("", motion)
            motion = re.compile(r"^(Senate|Assembly) ",
                                re.IGNORECASE).sub("", motion)
            motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "",
                            motion)
            motion = re.sub(r" \(\w+\)$", "", motion)
            motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion)
            motion = re.sub(
                r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                r"Urgency Clause$",
                "(Urgency Clause)",
                motion,
            )
            motion = re.sub(r"\s+", " ", motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result="pass" if result else "fail",
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {"threshold": vote.threshold}

            source_url = ("http://leginfo.legislature.ca.gov/faces"
                          "/billVotesClient.xhtml?bill_id={}").format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + "#" + str(vote_num)

            rc = {"yes": [], "no": [], "other": []}
            for record in vote.votes:
                if record.vote_code == "AYE":
                    rc["yes"].append(record.legislator_name)
                elif record.vote_code.startswith("NO"):
                    rc["no"].append(record.legislator_name)
                else:
                    rc["other"].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()