Ejemplo n.º 1
0
def record_votes(root, session, chamber):
    for el in root.xpath('//div{}'.format(''.join(vote_selectors))):
        mv = MaybeVote(el)
        if not mv.is_valid:
            continue

        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text='passage' if mv.passed else 'other',
            result='pass' if mv.passed else 'fail',
            classification='passage' if mv.passed else 'other',
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber
        )

        v.set_count('yes', mv.yeas or 0)
        v.set_count('no', mv.nays or 0)
        v.set_count('not voting', mv.present or 0)

        for each in mv.votes['yeas']:
            v.yes(each)
        for each in mv.votes['nays']:
            v.no(each)
        for each in mv.votes['present']:
            v.vote('not voting', each)
        for each in mv.votes['absent']:
            v.vote('absent', each)

        yield v
Ejemplo n.º 2
0
    def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
        votes = page.xpath("//table")[0]
        rows = votes.xpath(".//tr")[0]
        if rows[0].text_content() == 'Votes:':
            # New webste
            rows = votes.xpath(".//tr")[2]
        yno = rows.xpath(".//td")
        if len(yno) < 3:
            yes = yno[0]
            no, other = None, None
        else:
            yes, no, other = rows.xpath(".//td")[:3]

        def proc_block(obj, typ):
            if obj is None:
                return {
                    "type": None,
                    "count": None,
                    "votes": []
                }
            votes = []
            for vote in obj.xpath(".//br"):
                if vote.tail:
                    vote = vote.tail.strip()
                    if vote:
                        votes.append(vote)
            count = len(votes)
            return {
                "type": typ,
                "count": count,
                "votes": votes
            }

        vote_dict = {
            "yes": proc_block(yes, 'yes'),
            "no": proc_block(no, 'no'),
            "other": proc_block(other, 'other'),
        }

        yes_count = vote_dict['yes']['count']
        no_count = vote_dict['no']['count'] or 0
        other_count = vote_dict['other']['count'] or 0
        print(motion)
        vote = Vote(chamber=actor,
                    start_date=date,
                    motion_text=motion,
                    identifier=str(uniqid),
                    result='pass' if (yes_count > no_count) else 'fail',
                    classification='passage',
                    bill=bill)
        vote.extras = {'_vote_id': uniqid}
        vote.add_source(url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        for key in vote_dict:
            for voter in vote_dict[key]['votes']:
                vote.vote(key, voter)

        yield vote
Ejemplo n.º 3
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace('XHB', 'HB')
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower'
    )
    vote_event.pupa_id = url
    vote_event.set_count('yes', len(vote_record['yes']))
    vote_event.set_count('no', len(vote_record['no']))
    vote_event.set_count('excused', len(vote_record['excused']))
    vote_event.set_count('absent', len(vote_record['absent']))
    vote_event.set_count('other', len(vote_record['other']))
    for vote_type in ['yes', 'no', 'excused', 'absent', 'other']:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Ejemplo n.º 4
0
    def scrape_senate_vote(self, bill, url, date):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote(
            chamber='upper',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text='Passage',
            # setting 'fail' for now.
            result='fail',
            classification='passage',
            bill=bill
        )
        vote.add_source(url)

        text = convert_pdf(filename, 'text').decode('utf-8')
        os.remove(filename)

        if re.search('Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        data = filter(None, data)
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)
        vote_count = {
            'yes': 0,
            'no': 0,
            'other': 0
        }
        while True:
            if not data:
                break
            vote_val = data.pop()
            key = keymap.get(vote_val.lower(), 'other')
            values = data.pop()
            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
                if name.lower().strip() == 'none.':
                    continue
                name = name.replace('..', '')
                name = re.sub(r'\.$', '', name)
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] +
                                                     vote_count['other']) else 'fail'

        yield vote
Ejemplo n.º 5
0
    def get_vote_event(self, bill, act, votes, result):
        '''Make VoteEvent object from given Bill, action, votes and result.'''
        organization = json.loads(act['organization_id'].lstrip('~'))
        vote_event = VoteEvent(legislative_session=bill.legislative_session,
                               motion_text=act['description'],
                               organization=organization,
                               classification=None,
                               start_date=act['date'],
                               result=result,
                               bill=bill)

        legistar_web, legistar_api = [src['url'] for src in bill.sources]

        vote_event.add_source(legistar_web)
        vote_event.add_source(legistar_api + '/histories')

        for vote in votes:
            raw_option = vote['VoteValueName'].lower()

            if raw_option == 'suspended':
                continue

            clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
            vote_event.vote(clean_option, vote['VotePersonName'].strip())

        return vote_event
Ejemplo n.º 6
0
    def parse_roll_call(self, bill, link, chamber, date):
        url = link.attrib['href']
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
        motion = page.xpath(xpath).strip()
        motion = re.sub(r'\s+', ' ', motion)

        if motion == 'FP':
            motion = 'FINAL PASSAGE'

        if motion == 'FINAL PASSAGE':
            type = 'passage'
        elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
            type = 'amendment'
        else:
            type = 'other'
            motion = link.text_content()

        yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
        nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
        lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
        nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
        other = lve + nv

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=type,
            result='pass' if yeas > (nays + other) else 'fail',
            bill=bill,
        )
        vote.add_source(url)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('other', other)

        for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = div.text_content().strip()
            name = re.sub(r'^[\s,]+', '', name)
            name = re.sub(r'[\s,]+$', '', name)
            class_attr = div.attrib['class'].lower()
            if 'yea' in class_attr:
                voteval = 'yes'
            elif 'nay' in class_attr:
                voteval = 'no'
            elif 'nvote' in class_attr:
                voteval = 'other'
            elif 'lve' in class_attr:
                voteval = 'other'
            else:
                msg = 'Unrecognized vote val: %s' % class_attr
                raise Exception(msg)
            vote.vote(voteval, name)

        return vote
Ejemplo n.º 7
0
    def scrape_vote(self, bill, vote_id, session):
        vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
        form = {
            'rollCallId': vote_id,
            'sort': '',
            'group': '',
            'filter': '',
        }

        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
        if page:
            roll = page['Model']
            vote_chamber = self.chamber_map[roll['ChamberName']]
            # "7/1/16 01:00 AM"
            vote_date = dt.datetime.strptime(roll['TakenAtDateTime'],
                                             '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')

            # TODO: What does this code mean?
            vote_motion = roll['RollCallVoteType']

            vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
            other_count = (int(roll['NotVotingCount']) +
                           int(roll['VacantVoteCount']) +
                           int(roll['AbsentVoteCount']) +
                           int(roll['ConflictVoteCount'])
                           )
            vote = Vote(chamber=vote_chamber,
                        start_date=vote_date,
                        motion_text=vote_motion,
                        result=vote_passed,
                        classification='other',
                        bill=bill.identifier,
                        legislative_session=session
                        )
            vote.add_source(vote_url)
            vote.set_count('yes', roll['YesVoteCount'])
            vote.set_count('no', roll['NoVoteCount'])
            vote.set_count('other', other_count)

            for row in roll['AssemblyMemberVotes']:
                # AssemblyMemberId looks like it should work here,
                # but for some sessions it's bugged to only return session
                try:
                    voter = self.legislators_by_short[str(row['ShortName'])]
                    name = voter['DisplayName']
                except KeyError:
                    self.warning('could not find legislator short name %s',
                                 row['ShortName'])
                    name = row['ShortName']
                if row['SelectVoteTypeCode'] == 'Y':
                    vote.yes(name)
                elif row['SelectVoteTypeCode'] == 'N':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            # bill.add_vote_event(vote)
            yield vote
Ejemplo n.º 8
0
    def _parse_senate_votes(self, vote_data, bill, url):
        vote_datetime = datetime.datetime.strptime(
            vote_data['voteDate'], '%Y-%m-%d')

        if vote_data['voteType'] == 'FLOOR':
            motion = 'Floor Vote'
        elif vote_data['voteType'] == 'COMMITTEE':
            motion = '{} Vote'.format(vote_data['committee']['name'])
        else:
            raise ValueError('Unknown vote type encountered.')

        vote = VoteEvent(
            chamber='upper',
            start_date=vote_datetime.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='fail',
            bill=bill,
        )

        vote.add_source(url)

        vote_rolls = vote_data['memberVotes']['items']

        yes_count, no_count, other_count = 0, 0, 0

        # Count all yea votes.
        if 'items' in vote_rolls.get('AYE', {}):
            for legislator in vote_rolls['AYE']['items']:
                vote.yes(legislator['fullName'])
                yes_count += 1

        if 'items' in vote_rolls.get('AYEWR', {}):
            for legislator in vote_rolls['AYEWR']['items']:
                vote.yes(legislator['fullName'])
                yes_count += 1

        # Count all nay votes.
        if 'items' in vote_rolls.get('NAY', {}):
            for legislator in vote_rolls['NAY']['items']:
                vote.no(legislator['fullName'])
                no_count += 1

        # Count all other types of votes.
        other_vote_types = ('EXC', 'ABS', 'ABD')
        for vote_type in other_vote_types:
            if vote_rolls.get(vote_type, []):
                for legislator in vote_rolls[vote_type]['items']:
                    vote.vote('other', legislator['fullName'])
                    other_count += 1

        vote.result = 'pass' if yes_count > no_count else 'fail'
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)

        return vote
Ejemplo n.º 9
0
    def scrape_vote(self, bill, date, motion, url):
        try:
            page = self.get(url).text
            if 'not yet official' in page:
                # Sometimes they link to vote pages before they go live
                pass

            else:
                page = lxml.html.fromstring(page)

                if url.endswith('Senate'):
                    actor = 'upper'
                else:
                    actor = 'lower'

                votevals = ['yes', 'no', 'not voting',  'other']
                count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
                yes_count = int(page.xpath(count_path % "Yeas").split()[-1])
                no_count = int(page.xpath(count_path % "Nays").split()[-1])
                not_voting_count = int(page.xpath(count_path % "Non Voting").split()[-1])
                other_count = int(page.xpath(count_path % "Present").split()[-1])
                passed = yes_count > no_count + not_voting_count + other_count
                vote = VoteEvent(start_date='2017-03-04', motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 classification='passage',
                                 chamber=actor,
                                 bill=bill)
                try:
                    excused_count = int(page.xpath(count_path % "Excused").split()[-1])
                    vote.set_count('excused', excused_count)
                    votevals.append('excused')
                except:
                    pass
                vote.set_count('yes', yes_count)
                vote.set_count('no', no_count)
                vote.set_count('not voting', not_voting_count)
                vote.set_count('other', other_count)
                vote.add_source(url)

                xpath = (
                    '//*[contains(@class, "ms-standardheader")]/'
                    'following-sibling::table')
                divs = page.xpath(xpath)

                for (voteval, div) in zip(votevals, divs):
                    for a in div.xpath('.//a'):
                        name = a.text_content().strip()
                        if not name:
                            continue
                        else:
                            vote.vote(voteval, name)
                yield vote
        except:
            # sometiems the link is there but is dead
            pass
Ejemplo n.º 10
0
    def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
        for action in action_table.xpath('*')[1:]:
            date = action[0].text_content()
            date = dt.datetime.strptime(date, "%m/%d/%Y").strftime('%Y-%m-%d')
            actor = action[1].text_content().upper()
            string = action[2].text_content()
            actor = {
                "S": "upper",
                "H": "lower",
                "D": "legislature",  # "Data Systems",
                "$": "Appropriation measure",
                "CONAM": "Constitutional Amendment"
            }[actor]
            act_type, committees = categorize_action(string)
            # XXX: Translate short-code to full committee name for the
            #      matcher.

            real_committees = []

            if committees:
                for committee in committees:
                    try:
                        committee = self.short_ids[committee]['name']
                        real_committees.append(committee)
                    except KeyError:
                        pass
            act = bill.add_action(string, date, chamber=actor,
                                  classification=act_type)
            for committee in real_committees:
                act.add_related_entity(name=committee, entity_type="organization")
            vote = self.parse_vote(string)
            if vote:
                v, motion = vote
                vote = VoteEvent(start_date=date,
                                 chamber=actor,
                                 bill=bill_id,
                                 bill_chamber=bill_chamber,
                                 legislative_session=session,
                                 motion_text=motion,
                                 result='pass' if 'passed' in string.lower() else 'fail',
                                 classification='passage')
                vote.add_source(url)
                vote.set_count('yes', int(v['n_yes'] or 0))
                vote.set_count('no', int(v['n_no'] or 0))
                vote.set_count('not voting', int(v['n_excused'] or 0))
                for voter in split_specific_votes(v['yes']):
                    vote.yes(voter)
                for voter in split_specific_votes(v['yes_resv']):
                    vote.yes(voter)
                for voter in split_specific_votes(v['no']):
                    vote.no(voter)
                for voter in split_specific_votes(v['excused']):
                    vote.vote('not voting', voter)

                yield vote
Ejemplo n.º 11
0
    def handle_page(self):
        (date, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(date, '%m/%d/%Y %I:%M:%S %p'
                                          ).isoformat().replace('T', ' ')

        totals = self.doc.xpath('//table//table')[-1].text_content()
        totals = re.sub(r'(?mu)\s+', " ", totals).strip()
        (yes_count, no_count, other_count) = [int(x) for x in re.search(
            r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
            'Total Missed:\s+(\d+)', totals).groups()]
        result = 'pass' if yes_count > no_count else 'fail'

        (committee, ) = self.doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = VoteEvent(start_date=date,
                         bill=self.kwargs['bill'],
                         chamber='lower',
                         motion_text=motion,
                         result=result,
                         classification='committee',
                         )
        vote.add_source(self.url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('not voting', other_count)

        for member_vote in self.doc.xpath('//table//table//table//td'):
            if not member_vote.text_content().strip():
                continue

            (member, ) = member_vote.xpath('span[2]//text()')
            (member_vote, ) = member_vote.xpath('span[1]//text()')

            if member_vote == "Y":
                vote.yes(member)
            elif member_vote == "N":
                vote.no(member)
            elif member_vote == "-":
                vote.vote('not voting', member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r'\([YN]\)', member_vote):
                continue
            else:
                raise ValueError("Unknown vote type found: {}".format(member_vote))

        yield vote
Ejemplo n.º 12
0
    def parse_vote(self, bill, actor, date, motion, url, uniqid):
        page = self.get(url).text
        bill.add_source(url)
        vote_re = re.compile('YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                             '(.*)ABSENT( OR NOT VOTING)? -?\s?'
                             '(\d+)(.*)',
                             re.MULTILINE | re.DOTALL)
        match = vote_re.search(page)
        yes_count = int(match.group(1))
        no_count = int(match.group(3))
        other_count = int(match.group(6))

        if yes_count > no_count:
            passed = True
        else:
            passed = False

        if actor == 'upper' or actor == 'lower':
            vote_chamber = actor
        else:
            vote_chamber = ''

        vote = Vote(chamber=vote_chamber,
                    start_date=date,
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    identifier=str(uniqid),
                    classification='passage',
                    bill=bill)
        vote.add_source(url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)

        yes_votes = re.split('\s{2,}', match.group(2).strip())
        no_votes = re.split('\s{2,}', match.group(4).strip())
        other_votes = re.split('\s{2,}', match.group(7).strip())

        for yes in yes_votes:
            if yes:
                vote.yes(yes)
        for no in no_votes:
            if no:
                vote.no(no)
        for other in other_votes:
            if other:
                vote.vote('other', other)

        yield vote
Ejemplo n.º 13
0
    def scrape_votes(self, bill):
        bill_num = bill.identifier.split()[1]

        url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
               "GetRollCalls?billNumber=%s&biennium=%s" % (
                   bill_num, self.biennium))
        page = self.get(url)
        page = lxml.etree.fromstring(page.content)

        for rc in xpath(page, "//wa:RollCall"):
            motion = xpath(rc, "string(wa:Motion)")
            seq_no = xpath(rc, "string(wa:SequenceNumber)")

            date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
            date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

            yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
            no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
            abs_count = int(
                xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
            ex_count = int(
                xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))

            other_count = abs_count + ex_count

            agency = xpath(rc, "string(wa:Agency)")
            chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

            vote = Vote(chamber=chamber, start_date=date,
                        motion_text='{} (#{})'.format(motion, seq_no),
                        result='pass' if yes_count > (no_count + other_count) else 'fail',
                        classification='other', bill=bill)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('other', other_count)
            vote.add_source(url)
            for sv in xpath(rc, "wa:Votes/wa:Vote"):
                name = xpath(sv, "string(wa:Name)")
                vtype = xpath(sv, "string(wa:VOte)")

                if vtype == 'Yea':
                    vote.yes(name)
                elif vtype == 'Nay':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            yield vote
Ejemplo n.º 14
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
        committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath('../../td')[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(' - ')[-1].strip()
            motion = 'Committee vote (%s): %s' % (committee, motion)

            # Roll call
            vote_url = link.attrib['href']
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification='other',
                result='pass' if rollcall['passed'] else 'fail',
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count('yes', rollcall['yes_count'])
            vote.set_count('no', rollcall['no_count'])
            vote.set_count('other', rollcall['other_count'])

            for voteval in ('yes', 'no', 'other'):
                for name in rollcall.get(voteval + '_votes', []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Ejemplo n.º 15
0
    def scrape_chamber_votes(self, chamber, session):
        url = {
            "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
            "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
        }[chamber]
        action = "%s/%s" % (url, "votes.asp")
        dates = self.get_vote_dates(url, session)
        for date in dates:
            votes = self.parse_vote_page(self.post_to(action, date), url, session)
            for vote_dict in votes:
                for vote in vote_dict.values():
                    count = vote['count']
                    chamber = {
                        "H": "lower",
                        "S": "upper"
                    }[vote['meta']['chamber']]

                    try:
                        bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                    except KeyError:
                        self.warning('no such bill_id %s %s', chamber, vote['meta']['bill'])
                        continue

                    v = VoteEvent(
                        chamber=chamber,
                        start_date=vote['time'].strftime('%Y-%m-%d'),
                        motion_text=vote['meta']['extra']['motion'],
                        result='pass' if count['passage'] else 'fail',
                        classification='passage',
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=chamber,
                    )
                    v.set_count('yes', int(count['YEAS']))
                    v.set_count('no', int(count['NAYS']))
                    v.set_count('other', int(count['NOT VOTING']))
                    v.add_source(vote['source'])
                    v.pupa_id = vote['source']

                    for vt in vote['votes']:
                        key = {
                            'Y': 'yes',
                            'N': 'no',
                        }.get(vt['vote'], 'other')
                        v.vote(key, vt['name'])
                    yield v
Ejemplo n.º 16
0
    def scrape_votes(self, bill, page):
        base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
        for header in page['FloorHeaders']:
            params = {
                'billStatusId': page['BillId'],
                'billStatusActionId': header['BillStatusActionId'],
                'includeVotes': 'true',
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode('utf-8'))

            for action in actions:
                if action['Action'] == 'No Action':
                    continue
                action_date = datetime.datetime.strptime(action['ReportDate'], '%Y-%m-%dT%H:%M:%S')
                vote = VoteEvent(
                    chamber={
                        'S': 'upper',
                        'H': 'lower',
                    }[header['LegislativeBody']],
                    motion_text=action['Action'],
                    classification='passage',
                    result=(
                        'pass'
                        if action['UnanimouslyAdopted'] or action['Ayes'] > action['Nays']
                        else 'fail'
                    ),
                    start_date=action_date.strftime('%Y-%m-%d'),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count('yes', action['Ayes'] or 0)
                vote.set_count('no', action['Nays'] or 0)
                vote.set_count('other', (action['Present'] or 0))
                vote.set_count('absent', (action['Absent'] or 0))
                vote.set_count('excused', (action['Excused'] or 0))
                vote.set_count('not voting', (action['NotVoting'] or 0))

                for v in action['Votes']:
                    vote_type = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(v['Vote'], 'other')
                    vote.vote(vote_type, v['Legislator']['FullName'])
                vote.pupa_id = resp.url+str(action['ReferralNumber'])
                yield vote
Ejemplo n.º 17
0
    def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
        """
        takes the actor, date and row element and returns a Vote object
        """
        spans = row.xpath('.//span')
        motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
        motion = motion if motion else "passage"
        passed, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3)
        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)

        other_votes = []
        for span in spans[3:]:
            if span.text.startswith(('Absent', 'Excused')):
                other_votes += self.get_names(span.tail)
        for key, val in {'adopted': 'pass', 'passed': 'pass', 'failed': 'fail'}.items():
            if key in passed.lower():
                passed = val
                break
        vote = VoteEvent(chamber=actor,
                         start_date=date,
                         motion_text=motion,
                         bill=bill_id,
                         bill_chamber=bill_chamber,
                         result=passed,
                         classification="passage",
                         legislative_session=session)
        vote.add_source(source)
        vote.set_count('yes', int(yes_count))
        vote.set_count('no', int(no_count))
        vote.set_count('absent', int(other_count))
        for name in yes_votes:
            if name and name != 'None':
                vote.yes(name)
        for name in no_votes:
            if name and name != 'None':
                vote.no(name)
        for name in other_votes:
            if name and name != 'None':
                vote.vote('absent', name)
        yield vote
Ejemplo n.º 18
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result='pass' if self.passed() else 'fail',
            classification='passage',
            bill=self.bill,
        )
        v.set_count('yes', self.yes_count())
        v.set_count('no', self.no_count())
        v.set_count('other', self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote('other', voter)
        v.add_source(self.url)
        return v
Ejemplo n.º 19
0
    def scrape_vote(self, bill, motion, url):
        page = self.get(url, retry_on_404=True).text
        page = lxml.html.fromstring(page)

        yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
        yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

        nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
        no_count = int(nays_cell.xpath("string(following-sibling::td)"))

        abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
        abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

        ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
        ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

        other_count = abs_count + ex_count

        if 'chamber=House' in url:
            chamber = 'lower'
        elif 'chamber=Senate' in url:
            chamber = 'upper'

        date_cell = page.xpath("//td[text() = 'Date:']")[0]
        date = date_cell.xpath("string(following-sibling::td)")
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
        outcome = outcome_cell.xpath("string(following-sibling::td)")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            result='pass' if outcome == 'PREVAILS' else 'fail',
            classification='passage',
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")
            # name = name.split(" of ")[0]

            vtype = row.xpath("string(td[4])")
            if vtype == 'Y':
                vote.vote('yes', name)
            elif vtype == 'N':
                vote.vote('no', name)
            elif vtype == 'X' or vtype == 'E':
                vote.vote('other', name)

        yield vote
Ejemplo n.º 20
0
    def scrape_vote(self, bill, vote_json, session):

        if vote_json['amendmentNumber']:
            motion = '{}: {}'.format(
                vote_json['amendmentNumber'], vote_json['action'])
        else:
            motion = vote_json['action']

        result = 'pass' if vote_json['yesVotesCount'] > vote_json['noVotesCount'] else 'fail'

        v = VoteEvent(
            chamber=self.chamber_abbrev_map[vote_json['chamber']],
            start_date=self.parse_local_date(vote_json['voteDate']),
            motion_text=motion,
            result=result,
            legislative_session=session,
            bill=bill,
            classification='other',
        )

        v.set_count(option='yes', value=vote_json['yesVotesCount'])
        v.set_count('no', vote_json['noVotesCount'])
        v.set_count('absent', vote_json['absentVotesCount'])
        v.set_count('excused', vote_json['excusedVotesCount'])
        v.set_count('other', vote_json['conflictVotesCount'])

        for name in vote_json['yesVotes'].split(','):
            if name.strip():
                v.yes(name.strip())

        for name in vote_json['noVotes'].split(','):
            if name.strip():
                v.no(name.strip())

        # add votes with other classifications
        # option can be 'yes', 'no', 'absent',
        # 'abstain', 'not voting', 'paired', 'excused'
        for name in vote_json['absentVotes'].split(','):
            if name.strip():
                v.vote(option="absent",
                       voter=name)

        for name in vote_json['excusedVotes'].split(','):
            if name.strip():
                v.vote(option="excused",
                       voter=name)

        for name in vote_json['conflictVotes'].split(','):
            if name.strip():
                v.vote(option="other",
                       voter=name)

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, vote_json['billNumber'])
        v.add_source(source_url)

        yield v
Ejemplo n.º 21
0
    def parse_vote(self, bill, link):
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if len(opinions) > 0:
            temp = opinions[0].split()
            vote_chamber = temp[0]
            vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
            vote_status = " ".join(temp[2:-2])
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except:
                    pass
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count
            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 22
0
    def scrape_vote(self, bill, motion, url):
        page = self.get(url, retry_on_404=True).text
        page = lxml.html.fromstring(page)

        yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
        yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

        nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
        no_count = int(nays_cell.xpath("string(following-sibling::td)"))

        abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
        abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

        ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
        ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

        other_count = abs_count + ex_count

        if "chamber=House" in url:
            chamber = "lower"
        elif "chamber=Senate" in url:
            chamber = "upper"

        date_cell = page.xpath("//td[text() = 'Date:']")[0]
        date = date_cell.xpath("string(following-sibling::td)")
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
        outcome = outcome_cell.xpath("string(following-sibling::td)")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if outcome == "PREVAILS" else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")
            # name = name.split(" of ")[0]

            vtype = row.xpath("string(td[4])")
            if vtype == "Y":
                vote.vote("yes", name)
            elif vtype == "N":
                vote.vote("no", name)
            elif vtype == "X" or vtype == "E":
                vote.vote("other", name)

        yield vote
Ejemplo n.º 23
0
    def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):

        # parse the bill data page, finding the latest html text
        url = assembly_url + "&Floor%26nbspVotes=Y"

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        if "Votes:" in doc.text_content():
            vote_motions = []
            additional_votes_on_motion = 2
            for table in doc.xpath("//table"):

                date = table.xpath('caption/span[contains(., "DATE:")]')
                date = next(date[0].itersiblings()).text
                date = datetime.datetime.strptime(date, "%m/%d/%Y")
                date = eastern.localize(date)
                date = date.isoformat()

                spanText = table.xpath("caption/span/text()")
                motion = spanText[2].strip() + spanText[3].strip()
                if motion in vote_motions:
                    motion = motion + f" - Vote {additional_votes_on_motion}"
                    additional_votes_on_motion += 1
                else:
                    vote_motions.append(motion)

                votes = (table.xpath("caption/span/span")[0].text.split(":")
                         [1].split("/"))
                yes_count, no_count = map(int, votes)
                passed = yes_count > no_count
                vote = VoteEvent(
                    chamber="lower",
                    start_date=date,
                    motion_text=motion,
                    bill=bill,
                    result="pass" if passed else "fail",
                    classification="passage",
                )

                vote.set_count("yes", yes_count)
                vote.set_count("no", no_count)
                absent_count = 0
                excused_count = 0
                tds = table.xpath("tr/td/text()")
                votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]

                vote_dictionary = {
                    "Y": "yes",
                    "NO": "no",
                    "ER": "excused",
                    "AB": "absent",
                    "NV": "not voting",
                    "EL": "other"
                }

                for vote_pair in votes:
                    name, vote_val = vote_pair
                    vote.vote(vote_dictionary[vote_val], name)
                    if vote_val == "AB":
                        absent_count += 1
                    elif vote_val == "ER":
                        excused_count += 1

                vote.set_count("absent", absent_count)
                vote.set_count("excused", excused_count)
                vote.add_source(url)
                vote.pupa_id = url + motion + spanText[1]

                yield vote
Ejemplo n.º 24
0
    def process_vote(self, votes, url, base_url, bill, legislators,
                     chamber_dict, vote_results):
        for v in votes["items"]:
            try:
                v["yeas"]
            except KeyError:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url + v["link"]).json()
                try:
                    v["yeas"]
                except KeyError:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["date"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                try:
                    date = self._tz.localize(
                        datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                    date = "{:%Y-%m-%d}".format(date)
                except KeyError:
                    self.logger.warning("No date found for vote, skipping")
                    continue
            try:
                motion = v["action"]
            except KeyError:
                motion = v["motiontype"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
            if (not motion and isinstance(v['yeas'], str)
                    and isinstance(v['nays'], str)):
                waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(waringText.format(v['revno']))
                continue

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v['yeas']) > len(v['nays']):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            if "committee" in v:
                vote = VoteEvent(
                    chamber=chamber,
                    start_date=date,
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    # organization=v["committee"],
                    bill=bill,
                    classification='passed')
            else:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 classification='passed',
                                 bill=bill)
            vote.pupa_id = str(v['revno'])
            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is.
            yes_count = 0
            no_count = 0
            absent_count = 0
            excused_count = 0
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
                yes_count += 1
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
                no_count += 1
            if "absent" in v:
                for voter_id in v["absent"]:
                    vote.vote('absent', legislators[voter_id])
                    absent_count += 1
            if "excused" in v:
                for voter_id in v["excused"]:
                    vote.vote('excused', legislators[voter_id])
                    excused_count += 1

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            for key, val in v.items():
                if (type(val) == list and len(val) > 0
                        and key not in ["yeas", "nays", "absent", "excused"]):
                    if val[0] in legislators:
                        self.logger.warning(
                            "{k} looks like a vote type that's not being counted."
                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
Ejemplo n.º 25
0
    def parse_roll_call(self, bill, link, chamber, date):
        url = link.attrib['href']
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
        motion = page.xpath(xpath).strip()
        motion = re.sub(r'\s+', ' ', motion)

        if motion == 'FP':
            motion = 'FINAL PASSAGE'

        if motion == 'FINAL PASSAGE':
            type = 'passage'
        elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
            type = 'amendment'
        else:
            type = 'other'
            motion = link.text_content()

        yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
        nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
        lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
        nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
        other = lve + nv

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=type,
            result='pass' if yeas > (nays + other) else 'fail',
            bill=bill,
        )
        # pupa_id situation here is a bit weird, same vote can be used for
        # multiple bills see:
        # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11       # noqa
        # so we toss the bill id onto the end of the URL
        vote.pupa_id = url + '#' + bill.identifier
        vote.add_source(url)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('other', other)

        for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = div.text_content().strip()
            name = re.sub(r'^[\s,]+', '', name)
            name = re.sub(r'[\s,]+$', '', name)
            class_attr = div.attrib['class'].lower()
            if 'yea' in class_attr:
                voteval = 'yes'
            elif 'nay' in class_attr:
                voteval = 'no'
            elif 'nvote' in class_attr:
                voteval = 'other'
            elif 'lve' in class_attr:
                voteval = 'other'
            else:
                msg = 'Unrecognized vote val: %s' % class_attr
                raise Exception(msg)
            vote.vote(voteval, name)

        return vote
Ejemplo n.º 26
0
    def scrape_house_vote(self, bill, url):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, 'text')
        os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None

        for idx, line in enumerate(lines):
            line = line.rstrip().decode('utf-8')
            match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
            if match:
                motion = (lines[idx - 2].strip()).decode('utf-8')
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                exc_match = re.search(r'EXCUSED: (\d+)', line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                if line.endswith('ADOPTED') or line.endswith('PASSED'):
                    passed = True
                else:
                    passed = False

                continue

            match = re.match(
                r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
            if match:
                vote_type = {
                    'YEAS': 'yes',
                    'NAYS': 'no',
                    'NOT VOTING': 'other',
                    'EXCUSED': 'other',
                    'PAIRED': 'paired'
                }[match.group(1)]
                continue

            if vote_type == 'paired':
                for part in line.split('   '):
                    part = part.strip()
                    if not part:
                        continue
                    name, pair_type = re.match(r'([^\(]+)\((YEA|NAY)\)',
                                               line).groups()
                    name = name.strip()
                    if pair_type == 'YEA':
                        votes['yes'].append(name)
                    elif pair_type == 'NAY':
                        votes['no'].append(name)
            elif vote_type:
                for name in line.split('   '):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)
        if date:
            vote = VoteEvent(chamber='lower',
                             start_date=date.strftime("%Y-%m-%d"),
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification='passage',
                             bill=bill)

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('other', other_count)
            vote.add_source(url)
            vote.pupa_id = url

            for key, values in votes.items():
                for value in values:
                    vote.vote(key, value)

            yield vote
        else:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
Ejemplo n.º 27
0
    def handle_page(self):
        MOTION_INDEX = 4
        TOTALS_INDEX = 6
        VOTE_START_INDEX = 9

        if len(self.lines) < 2:
            self.scraper.warning("Bad PDF! " + self.url)
            return

        motion = self.lines[MOTION_INDEX].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if (not motion and
                not self.lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
            motion = self.lines[MOTION_INDEX - 1]
            MOTION_INDEX -= 1
            TOTALS_INDEX -= 1
            VOTE_START_INDEX -= 1
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if self.lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(motion,
                                         self.lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break

        (yes_count, no_count, nv_count) = [
            int(x) for x in re.search(
                r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
                self.lines[TOTALS_INDEX]).groups()
        ]
        result = 'pass' if yes_count > no_count else 'fail'

        vote = VoteEvent(
            start_date=self.kwargs['date'],
            chamber=self.kwargs['chamber'],
            bill=self.kwargs['bill'],
            motion_text=motion,
            result=result,
            classification='passage',
        )
        vote.add_source(self.url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('not voting', nv_count)

        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break

            if " President " in line:
                line = line.replace(" President ", " ")
            elif " Speaker " in line:
                line = line.replace(" Speaker ", " ")

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for vtype, member in re.findall(
                    r'\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*', line):
                vtype = {
                    'Y': 'yes',
                    'N': 'no',
                    'EX': 'excused',
                    'AV': 'abstain'
                }[vtype]
                vote.vote(vtype, member)

        # check totals line up
        yes_count = no_count = nv_count = 0
        for vc in vote.counts:
            if vc['option'] == 'yes':
                yes_count = vc['value']
            elif vc['option'] == 'no':
                no_count = vc['value']
            else:
                nv_count += vc['value']

        for vr in vote.votes:
            if vr['option'] == 'yes':
                yes_count -= 1
            elif vr['option'] == 'no':
                no_count -= 1
            else:
                nv_count -= 1

        if yes_count != 0 or no_count != 0:
            raise ValueError('vote count incorrect: ' + self.url)

        if nv_count != 0:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.scraper.info(
                "Votes don't add up; looking for additional ones")
            for line in self.lines[VOTE_START_INDEX:]:
                if not line.strip():
                    break
                for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                         line):
                    vote.vote('not voting', member)
        yield vote
Ejemplo n.º 28
0
    def scrape_vote(self, bill, motion, url):
        page = self.get(url, retry_on_404=True).text
        page = lxml.html.fromstring(page)

        yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
        yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

        nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
        no_count = int(nays_cell.xpath("string(following-sibling::td)"))

        abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
        abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

        ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
        ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

        other_count = abs_count + ex_count

        if 'chamber=House' in url:
            chamber = 'lower'
        elif 'chamber=Senate' in url:
            chamber = 'upper'

        date_cell = page.xpath("//td[text() = 'Date:']")[0]
        date = date_cell.xpath("string(following-sibling::td)")
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
        outcome = outcome_cell.xpath("string(following-sibling::td)")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            result='pass' if outcome == 'PREVAILS' else 'fail',
            classification='passage',
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")
            name = name.split(" of ")[0]

            vtype = row.xpath("string(td[4])")
            if vtype == 'Y':
                vote.vote('yes', name)
            elif vtype == 'N':
                vote.vote('no', name)
            elif vtype == 'X' or vtype == 'E':
                vote.vote('other', name)

        yield vote
Ejemplo n.º 29
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath(
                '//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
        except:
            self.warning("Vote Summary Page Broken ")
            return

        if 'withdrawn' not in motion:
            # Every table row after the one with VOTE in a td/div/b/font
            rolls = page.xpath('//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')

            count_row = rolls[-1]
            yes_count = count_row.xpath('.//b/font[normalize-space(text())="YES:"]'
                                        '/../following-sibling::font[1]/text()')[0]
            no_count = count_row.xpath('.//b/font[normalize-space(text())="NO:"]'
                                       '/../following-sibling::font[1]/text()')[0]
            exc_count = count_row.xpath('.//b/font[normalize-space(text())="EXC:"]'
                                        '/../following-sibling::font[1]/text()')[0]
            nv_count = count_row.xpath('.//b/font[normalize-space(text())="ABS:"]'
                                       '/../following-sibling::font[1]/text()')[0]

            if count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                               '/../following-sibling::b[1]/font/text()'):
                final = count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                                        '/../following-sibling::b[1]/font/text()')[0]
                passed = ('pass' in final.lower() or int(yes_count) > int(no_count))
            elif 'passed without objection' in motion.lower():
                passed = True
                yes_count = int(len(rolls[:-2]))
            else:
                self.warning("No vote breakdown found for %s" % vote_url)
                return

            vote = VoteEvent(chamber=chamber,
                             start_date=self._tz.localize(date),
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             bill=bill,
                             classification='passage',
                             )
            vote.pupa_id = vote_url
            vote.set_count('yes', int(yes_count))
            vote.set_count('no', int(no_count))
            vote.set_count('excused', int(exc_count))
            vote.set_count('not voting', int(nv_count))
            vote.add_source(vote_url)

            for roll in rolls[:-2]:
                voter = roll.xpath('td[2]/div/font')[0].text_content()
                voted = roll.xpath('td[3]/div/font')[0].text_content().strip()

                if voted:
                    if 'Yes' in voted:
                        vote.yes(voter)
                    elif 'No' in voted:
                        vote.no(voter)
                    elif 'Excused' in voted:
                        vote.vote('excused', voter)
                    else:
                        vote.vote("other", voter)

                elif 'passed without objection' in motion.lower() and voter:
                    vote.yes(voter)
            yield vote
Ejemplo n.º 30
0
    def scrape_votes(self, bill, doc):
        vote_tr_path = ('//h6[@id="vote-header"]'
                        '/ancestor::div[contains(@class, "gray-card")]'
                        '//div[contains(@class, "card-body")]'
                        '//div[@class="row"]')

        for vote_row in doc.xpath(vote_tr_path):
            entries = [
                each.text_content() for each in vote_row.xpath('div')[1:-1:2]
            ]
            date, subject, rcs, aye, no, nv, abs, exc, total = entries
            result = vote_row.xpath('div/a')[0]
            result_text = result.text
            result_link = result.get('href')

            if 'H' in rcs:
                chamber = 'lower'
            elif 'S' in rcs:
                chamber = 'upper'

            date = eastern.localize(
                dt.datetime.strptime(date.replace('.', ''),
                                     "%m/%d/%Y %H:%M %p"))
            date = date.isoformat()

            ve = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=subject,
                result='pass' if 'PASS' in result_text else 'fail',
                bill=bill,
                classification='passage',  # TODO: classify votes
            )
            ve.set_count('yes', int(aye))
            ve.set_count('no', int(no))
            ve.set_count('not voting', int(nv))
            ve.set_count('absent', int(abs))
            ve.set_count('excused', int(exc))
            ve.add_source(result_link)

            data = self.get(result_link).text
            vdoc = lxml.html.fromstring(data)

            # only one table that looks like this
            vote_table, = vdoc.xpath('//table[@cellpadding="5"]')

            # skip party row
            for row in vote_table.xpath('tr')[1:]:
                vote_type, dems, reps = row.xpath('td')

                vote_type = vote_type.text_content()
                if 'Ayes' in vote_type:
                    vote_type = 'yes'
                elif 'Noes' in vote_type:
                    vote_type = 'no'
                elif 'Not Voting' in vote_type:
                    vote_type = 'not voting'
                elif 'Exc. Absence' in vote_type:
                    vote_type = 'absent'
                elif 'Exc. Vote' in vote_type:
                    vote_type = 'excused'
                else:
                    raise ValueError('unknown vote type: ' + vote_type)

                for name in (vote_list_to_names(dems.text_content()) +
                             vote_list_to_names(reps.text_content())):
                    ve.vote(vote_type, name)

            yield ve
Ejemplo n.º 31
0
    def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
        raw_vote_data = re.split('\w+? by [\w ]+?\s+-', vote_data.strip())[1:]

        motion_count = 1

        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(
                u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            vote_date = re.search('(\d+/\d+/\d+)', motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(),
                                                       '%m/%d/%Y')

            passed = ('Passed' in motion or 'Recommended for passage' in motion
                      or 'Rec. for pass' in motion or 'Adopted' in raw_vote[1])
            vote_regex = re.compile('\d+$')
            aye_regex = re.compile('^.+voting aye were: (.+) -')
            no_regex = re.compile('^.+voting no were: (.+) -')
            not_voting_regex = re.compile(
                '^.+present and not voting were: (.+) -')
            yes_count = 0
            no_count = 0
            not_voting_count = 0
            ayes = []
            nos = []
            not_voting = []

            for v in raw_vote[1:]:
                v = v.strip()
                if v.startswith('Ayes...') and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith('Noes...') and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif v.startswith(
                        'Present and not voting...') and vote_regex.search(v):
                    not_voting_count += int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(', ')
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(', ')
                elif not_voting_regex.search(v):
                    not_voting += not_voting_regex.search(v).groups()[0].split(
                        ', ')

            motion = motion.strip()
            motion = motion.replace('&AMP;', '&')  # un-escape ampersands
            if motion in self._seen_votes:
                motion = '{} ({})'.format(motion, motion_count)
                motion_count += 1
            self._seen_votes.add(motion)

            vote = VoteEvent(
                motion_text=motion,
                start_date=vote_date.strftime('%Y-%m-%d')
                if vote_date else None,
                classification='passage',
                result='pass' if passed else 'fail',
                chamber=chamber,
                bill=bill,
            )
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('not voting', not_voting_count)
            vote.add_source(link)

            seen = set()
            for a in ayes:
                if a in seen:
                    continue
                vote.yes(a)
                seen.add(a)
            for n in nos:
                if n in seen:
                    continue
                vote.no(n)
                seen.add(n)
            for n in not_voting:
                if n in seen:
                    continue
                vote.vote('not voting', n)
                seen.add(n)

            yield vote
Ejemplo n.º 32
0
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        elif location.startswith('Joint'):
            chamber = 'joint'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        # committee = ' '.join(location.split(' ')[1:]).strip()
        # if not committee or committee.startswith('of Representatives'):
        #     committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//td[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                type = 'passage'
            elif motion == 'Concurred in amendments':
                type = 'amendment'
            elif motion == 'Veto override':
                type = 'veto_override'
            else:
                type = 'other'

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification=type,
                             bill=bill)

            vote.add_source(url)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', excused_count)
            vote.set_count('absent', absent_count)

            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text in ('Aye', 'Yea'):
                    vote.yes(td.getprevious().text.strip())
                elif td.text == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif td.text == 'Excused':
                    vote.vote('excused', td.getprevious().text.strip())
                elif td.text == 'Absent':
                    vote.vote('absent', td.getprevious().text.strip())

            yield vote
Ejemplo n.º 33
0
    def scrape_votes(self, bill, url):
        page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={'re': re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            if not motion.strip():
                self.warning("Motion text not found")
                return
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break
                regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
                         r'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')
                match = re.match(regex, line)
                if match:
                    if match.group(1) == 'YEAS' and 'RCS#' not in line:
                        vtype = 'yes'
                        seen_yes = True
                    elif match.group(1) == 'NAYS' and seen_yes:
                        vtype = 'no'
                    elif match.group(1) == 'VACANT':
                        continue  # skip these
                    elif seen_yes:
                        vtype = 'other'
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE' in name or 'SENATE ' in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber=chamber,
                        start_date=date.strftime('%Y-%m-%d'),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        bill=bill,
                        classification='passage')
            vote.set_count('yes', counts['yes'])
            vote.set_count('no', counts['no'])
            vote.set_count('other', counts['other'])
            vote.pupa_id = url + '#' + rcs

            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                if ':' in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes['other']:
                vote.vote('other', name)

            yield vote
Ejemplo n.º 34
0
    def handle_page(self):
        (_, motion) = self.lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.scraper.warning("Vote appears to be empty")
            return

        vote_top_row = [
            self.lines.index(x) for x in self.lines
            if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', x)
        ][0]
        yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
        nay_columns_begin = self.lines[vote_top_row].index("Nay")

        votes = {'yes': [], 'no': [], 'other': []}
        for line in self.lines[(vote_top_row + 1):]:
            if line.strip():
                member = re.search(
                    r'''(?x)
                        ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                        ([A-Z][a-z]+            # Name must have lower-case characters
                        [\w\-\s]+)              # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                        (?:\s{2,}.*)?           # Name ends when many spaces are seen
                        ''', line).group(1)
                # sometimes members have trailing X's from other motions in the
                # vote sheet we aren't collecting
                member = re.sub('(\s+X)+', '', member)
                # Usually non-voting members won't even have a code listed
                # Only a couple of codes indicate an actual vote:
                # "VA" (vote after roll call) and "VC" (vote change)
                did_vote = bool(re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
                if did_vote:
                    # Check where the "X" or vote code is on the page
                    vote_column = len(line) - len(line.lstrip())
                    if vote_column <= yea_columns_end:
                        votes['yes'].append(member)
                    elif vote_column >= nay_columns_begin:
                        votes['no'].append(member)
                    else:
                        raise ValueError(
                            "Unparseable vote found for {0} in {1}:\n{2}".
                            format(member, self.url, line))
                else:
                    votes['other'].append(member)

            # End loop as soon as no more members are found
            else:
                break

        totals = re.search(r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS',
                           self.text).groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        result = 'pass' if (yes_count > no_count) else 'fail'

        vote = VoteEvent(start_date=self.kwargs['date'],
                         bill=self.kwargs['bill'],
                         chamber='upper',
                         motion_text=motion,
                         classification='committee',
                         result=result)
        vote.add_source(self.url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', len(votes['other']))

        # set voters
        for vtype, voters in votes.items():
            for voter in voters:
                vote.vote(vtype, voter)

        yield vote
Ejemplo n.º 35
0
    def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        try:
            page = self.get(url).text
        except scrapelib.HTTPError:
            self.warning("A vote page not found for bill {}".format(
                bill.identifier))
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        descr = page.xpath("//b")[0].text_content()
        if descr == '':
            # New page method
            descr = page.xpath("//div[@id='content']/center")[0].text

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            yield from self.scrape_committee_vote(bill, actor, date, motion,
                                                  page, url, uniqid)
            return

        passed = None
        if "Passed" in descr:
            passed = True
        elif "Failed" in descr:
            passed = False
        elif "UTAH STATE LEGISLATURE" in descr:
            return
        elif descr.strip() == '-':
            return
        else:
            self.warning(descr)
            raise NotImplementedError("Can't see if we passed or failed")

        headings = page.xpath("//b")[1:]
        votes = page.xpath("//table")
        sets = zip(headings, votes)
        vdict = {}
        for (typ, votes) in sets:
            txt = typ.text_content()
            arr = [x.strip() for x in txt.split("-", 1)]
            if len(arr) != 2:
                continue
            v_txt, count = arr
            v_txt = v_txt.strip()
            count = int(count)
            people = [
                x.text_content().strip()
                for x in votes.xpath(".//font[@face='Arial']")
            ]

            vdict[v_txt] = {"count": count, "people": people}

        vote = Vote(chamber=actor,
                    start_date=date,
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    bill=bill,
                    classification='passage',
                    identifier=str(uniqid))
        vote.set_count('yes', vdict['Yeas']['count'])
        vote.set_count('no', vdict['Nays']['count'])
        vote.set_count('other', vdict['Absent or not voting']['count'])
        vote.add_source(url)

        for person in vdict['Yeas']['people']:
            vote.yes(person)
        for person in vdict['Nays']['people']:
            vote.no(person)
        for person in vdict['Absent or not voting']['people']:
            vote.vote('other', person)

        yield vote
Ejemplo n.º 36
0
    def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                    action_text):
        url = ('http://alisondb.legislature.state.al.us/Alison/'
               'GetRollCallVoteResults.aspx?'
               'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
               format(vote_id, vote_chamber, bill_id, self.session_id))
        doc = lxml.html.fromstring(self.get(url=url).text)

        voters = {'Y': [], 'N': [], 'P': [], 'A': []}

        voters_and_votes = doc.xpath('//table/tr/td/font/text()')
        capture_vote = False
        name = ''
        for item in voters_and_votes:
            if capture_vote:
                capture_vote = False
                if name:
                    voters[item].append(name)
            else:
                capture_vote = True
                name = item
                if (name.endswith(", Vacant") or
                        name.startswith("Total ") or
                        not name.strip()):
                    name = ''

        # Check name counts against totals listed on the site
        total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
        if total_yea:
            total_yea = int(total_yea[0].split(":")[-1])
            assert total_yea == len(voters['Y']), "Yea count incorrect"
        else:
            total_yea = len(voters['Y'])

        total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
        if total_nay:
            total_nay = int(total_nay[0].split(":")[-1])
            assert total_nay == len(voters['N']), "Nay count incorrect"
        else:
            total_nay = len(voters['N'])

        total_absent = doc.xpath(
            '//*[starts-with(text(), "Total Absent")]/text()')
        if total_absent:
            total_absent = int(total_absent[0].split(":")[-1])
            assert total_absent == len(voters['A']), "Absent count incorrect"
        total_other = len(voters['P']) + len(voters['A'])

        vote = VoteEvent(
            chamber=self.CHAMBERS[vote_chamber[0]],
            start_date=vote_date,
            motion_text=action_text,
            result='pass' if total_yea > total_nay else 'fail',
            classification='passage',
            bill=bill,
        )
        vote.set_count('yes', total_yea)
        vote.set_count('no', total_nay)
        vote.set_count('other', total_other)
        vote.add_source(url)
        for member in voters['Y']:
            vote.vote('yes', member)
        for member in voters['N']:
            vote.vote('no', member)
        for member in (voters['A'] + voters['P']):
            vote.vote('other', member)

        yield vote
Ejemplo n.º 37
0
    def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
        raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", vote_data.strip())[1:]

        motion_count = 1

        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
            motion = raw_vote[0]

            vote_date = re.search(r"(\d+/\d+/\d+)", motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

            passed = (
                "Passed" in motion
                or "Recommended for passage" in motion
                or "Rec. for pass" in motion
                or "Adopted" in raw_vote[1]
            )
            vote_regex = re.compile(r"\d+$")
            aye_regex = re.compile(r"^.+voting aye were: (.+) -")
            no_regex = re.compile(r"^.+voting no were: (.+) -")
            not_voting_regex = re.compile(r"^.+present and not voting were: (.+) -")
            yes_count = 0
            no_count = 0
            not_voting_count = 0
            ayes = []
            nos = []
            not_voting = []

            for v in raw_vote[1:]:
                v = v.strip()
                if v.startswith("Ayes...") and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith("Noes...") and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif v.startswith("Present and not voting...") and vote_regex.search(v):
                    not_voting_count += int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(", ")
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(", ")
                elif not_voting_regex.search(v):
                    not_voting += not_voting_regex.search(v).groups()[0].split(", ")

            motion = motion.strip()
            motion = motion.replace("&AMP;", "&")  # un-escape ampersands
            if motion in self._seen_votes:
                motion = "{} ({})".format(motion, motion_count)
                motion_count += 1
            self._seen_votes.add(motion)

            vote = VoteEvent(
                motion_text=motion,
                start_date=vote_date.strftime("%Y-%m-%d") if vote_date else None,
                classification="passage",
                result="pass" if passed else "fail",
                chamber=chamber,
                bill=bill,
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("not voting", not_voting_count)
            vote.add_source(link)

            seen = set()
            for a in ayes:
                if a in seen:
                    continue
                vote.yes(a)
                seen.add(a)
            for n in nos:
                if n in seen:
                    continue
                vote.no(n)
                seen.add(n)
            for n in not_voting:
                if n in seen:
                    continue
                vote.vote("not voting", n)
                seen.add(n)

            yield vote
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for current year
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.get(url).text
        # if first page isn't found, try second year
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            html = self.get('http://legislature.mi.gov/doc.aspx?%s-%s' %
                            (session[-4:], bill_id.replace(' ', '-'))).text
            if ('Page Not Found' in html
                    or 'The bill you are looking for is not available yet'
                    in html):
                return

        doc = lxml.html.fromstring(html)

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(bill_id,
                    session,
                    title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u'\xa0', ' ')
            if len(sponsors) > 1:
                classification = ('primary'
                                  if sponsor.tail and 'primary' in sponsor.tail
                                  else 'cosponsor')
            else:
                classification = 'primary'
            bill.add_sponsorship(
                name=name,
                chamber=chamber,
                entity_type='person',
                primary=classification == 'primary',
                classification=classification,
            )

        bill.subject = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(
                datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            classification = categorize_action(action)
            bill.add_action(action,
                            date,
                            chamber=actor,
                            classification=classification)

            # check if action mentions a vote
            rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    results = self.parse_roll_call(vote_url, rc_num)
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result='pass' if
                        len(results['yes']) > len(results['no']) else 'fail',
                        classification='passage',
                    )

                    # check the expected counts vs actual
                    count = re.search('YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['yes']):
                        self.warning(
                            'vote count mismatch for %s %s, %d != %d' %
                            (bill_id, action, count, len(results['yes'])))
                    count = re.search('NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['no']):
                        self.warning(
                            'vote count mismatch for %s %s, %d != %d' %
                            (bill_id, action, count, len(results['no'])))

                    vote.set_count('yes', len(results['yes']))
                    vote.set_count('no', len(results['no']))
                    vote.set_count('other', len(results['other']))

                    for name in results['yes']:
                        vote.yes(name)
                    for name in results['no']:
                        vote.no(name)
                    for name in results['other']:
                        vote.vote('other', name)

                    vote.add_source(vote_url)
                    yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif url.endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
Ejemplo n.º 39
0
    def scrape(self, window=28) :
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in self.matters(n_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Board of Directors"})
            
            legistar_web = matter['legistar_url']
            
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill. 
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Ejemplo n.º 40
0
    def scrape_house_vote(self, bill, url):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, 'text')
        os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None

        for idx, line in enumerate(lines):
            line = line.rstrip().decode('utf-8')
            match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)',
                line)
            if match:
                motion = (lines[idx - 2].strip()).decode('utf-8')
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()]

                exc_match = re.search(r'EXCUSED: (\d+)', line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                if line.endswith('ADOPTED') or line.endswith('PASSED'):
                    passed = True
                else:
                    passed = False

                continue

            match = re.match(
                r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$',
                line)
            if match:
                vote_type = {'YEAS': 'yes',
                             'NAYS': 'no',
                             'NOT VOTING': 'other',
                             'EXCUSED': 'other',
                             'PAIRED': 'paired'}[match.group(1)]
                continue

            if vote_type == 'paired':
                for part in line.split('   '):
                    part = part.strip()
                    if not part:
                        continue
                    name, pair_type = re.match(
                        r'([^\(]+)\((YEA|NAY)\)', line).groups()
                    name = name.strip()
                    if pair_type == 'YEA':
                        votes['yes'].append(name)
                    elif pair_type == 'NAY':
                        votes['no'].append(name)
            elif vote_type:
                for name in line.split('   '):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)
        if date:
            vote = Vote(chamber='lower',
                        start_date=date.strftime("%Y-%m-%d"),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=bill)

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('other', other_count)
            vote.add_source(url)

            for key, values in votes.items():
                for value in values:
                    vote.vote(key, value)

            yield vote
        else:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
Ejemplo n.º 41
0
    def scrape_journal(self, url, chamber, session, date):

        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r' % filename)
        all_text = convert_pdf(filename, type="text")

        lines = all_text.split(b'\n')
        lines = [line.decode('utf-8') for line in lines]
        lines = [line.
                 strip().
                 replace('–', '-').
                 replace('―', '"').
                 replace('‖', '"').
                 replace('“', '"').
                 replace('”', '"')
                 for line in lines]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter([line for line in lines if not(
                     line == "" or
                     re.match(header_date_re, line) or
                     re.match(header_journal_re, line))])

        for line in lines:
            # Go through with vote parse if any of
            # these conditions match.
            if not line.startswith("On the question") or \
                    "shall" not in line.lower():
                continue

            # Get the bill_id
            bill_id = None
            bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

            # The Senate ends its motion text with a vote announcement
            if chamber == "upper":
                end_of_motion_re = r'.* the vote was:\s*'
            # The House may or may not end motion text with a bill name
            elif chamber == "lower":
                end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

            while not re.match(end_of_motion_re, line, re.IGNORECASE):
                line += " " + next(lines)

            try:
                bill_id = re.search(bill_re, line).group(1)
            except AttributeError:
                self.warning("This motion did not pertain to legislation: {}".
                             format(line))
                continue

            # Get the motion text
            motion_re = r'''
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*"\s+  # Motion is followed by a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    '''.format(
                    bill_re,
                    r',?.*?the\svote\swas:' if chamber == 'upper' else ''
                    )
            print(line)
            motion = re.search(motion_re,
                               line,
                               re.VERBOSE | re.IGNORECASE).group(1)

            for word, letter in (('Senate', 'S'),
                                 ('House', 'H'),
                                 ('File', 'F')):

                if bill_id is None:
                    return

                bill_id = bill_id.replace(word, letter)

            bill_id = bill_id.replace('.', '')

            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            self.current_id = bill_id
            votes, passed = self.parse_votes(lines)

            # at the very least, there should be a majority
            # for the bill to have passed, so check that,
            # but if the bill didn't pass, it could still be OK if it got a majority
            # eg constitutional amendments
            if not ((passed == (votes['yes_count'] > votes['no_count'])) or (not passed)):
                self.error("The bill passed without a majority?")
                raise ValueError('invalid vote')

            # also throw a warning if the bill failed but got a majority
            # it could be OK, but is probably something we'd want to check
            if not passed and votes['yes_count'] > votes['no_count']:
                self.logger.warning("The bill got a majority but did not pass. "
                                    "Could be worth confirming.")

            result = ""
            if passed:
                result = "pass"
            else:
                result = "fail"

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=re.sub('\xad', '-', motion),
                             result=result,
                             classification='passage',
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=bill_chamber
                             )

            # add votes and counts
            for vtype in ('yes', 'no', 'absent', 'abstain'):
                vcount = votes['{}_count'.format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes['{}_votes'.format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
Ejemplo n.º 42
0
    def scrape_journal(self, url, chamber, session, date):

        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r' % filename)
        all_text = convert_pdf(filename, type="text")

        lines = all_text.split(b'\n')
        lines = [line.decode('utf-8') for line in lines]
        lines = [
            line.strip().replace('–', '-').replace('―', '"').replace(
                '‖', '"').replace('“', '"').replace('”', '"') for line in lines
        ]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter([
            line for line in lines
            if not (line == "" or re.match(header_date_re, line)
                    or re.match(header_journal_re, line))
        ])

        for line in lines:
            # Go through with vote parse if any of
            # these conditions match.
            if not line.startswith("On the question") or \
                    "shall" not in line.lower():
                continue

            # Get the bill_id
            bill_id = None
            bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

            # The Senate ends its motion text with a vote announcement
            if chamber == "upper":
                end_of_motion_re = r'.* the vote was:\s*'
            # The House may or may not end motion text with a bill name
            elif chamber == "lower":
                end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(
                    bill_re)

            while not re.match(end_of_motion_re, line, re.IGNORECASE):
                line += " " + next(lines)

            try:
                bill_id = re.search(bill_re, line).group(1)
            except AttributeError:
                self.warning(
                    "This motion did not pertain to legislation: {}".format(
                        line))
                continue

            # Get the motion text
            motion_re = r'''
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*(?:\?"?|"|’)\s+  # Motion is followed by a question mark and/or a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    '''.format(
                bill_re, r',?.*?the\svote\swas:' if chamber == 'upper' else '')
            print(line)
            motion = re.search(motion_re, line,
                               re.VERBOSE | re.IGNORECASE).group(1)

            for word, letter in (('Senate', 'S'), ('House', 'H'), ('File',
                                                                   'F')):

                if bill_id is None:
                    return

                bill_id = bill_id.replace(word, letter)

            bill_id = bill_id.replace('.', '')

            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            votes, passed = self.parse_votes(lines)

            # at the very least, there should be a majority
            # for the bill to have passed, so check that,
            # but if the bill didn't pass, it could still be OK if it got a majority
            # eg constitutional amendments
            if not ((passed == (votes['yes_count'] > votes['no_count'])) or
                    (not passed)):
                self.error("The bill passed without a majority?")
                raise ValueError('invalid vote')

            # also throw a warning if the bill failed but got a majority
            # it could be OK, but is probably something we'd want to check
            if not passed and votes['yes_count'] > votes['no_count']:
                self.logger.warning(
                    "The bill got a majority but did not pass. "
                    "Could be worth confirming.")

            result = ""
            if passed:
                result = "pass"
            else:
                result = "fail"

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=re.sub('\xad', '-', motion),
                             result=result,
                             classification='passage',
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=bill_chamber)

            # add votes and counts
            for vtype in ('yes', 'no', 'absent', 'abstain'):
                vcount = votes['{}_count'.format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes['{}_votes'.format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
Ejemplo n.º 43
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower') or
                    (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id, source_url, media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime(
                    '%m/%d/%y')
                version_name = "{} - {}".format(
                    version_date_human, version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type='application/pdf',
                    date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    def replacer(matchobj):
                        if matchobj:
                            return {'Assembly': 'lower',
                                    'Senate': 'upper'}[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                                           classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(
                        committee, entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = (
                    'http://leginfo.legislature.ca.gov/faces'
                    '/billVotesClient.xhtml?bill_id={}'
                ).format(fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Ejemplo n.º 44
0
    def scrape_vote(self, bill, name, url):
        if "VOTE/h" in url:
            vote_chamber = 'lower'
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = 'upper'
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        page = self.get(url, verify=False).text

        if 'BUDGET ADDRESS' in page:
            return

        page = lxml.html.fromstring(page)

        yes_count = page.xpath(
            "string(//span[contains(., 'Those voting Yea')])")
        yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

        no_count = page.xpath(
            "string(//span[contains(., 'Those voting Nay')])")
        no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

        other_count = page.xpath(
            "string(//span[contains(., 'Those absent')])")
        other_count = int(
            re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

        need_count = page.xpath(
            "string(//span[contains(., 'Necessary for')])")
        need_count = int(
            re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(date + " " + bill.legislative_session,
                                          "%m/%d %Y").date()

        # not sure about classification.
        vote = Vote(chamber=vote_chamber,
                    start_date=date,
                    motion_text=name,
                    result='pass' if yes_count > need_count else 'fail',
                    classification='passage',
                    bill=bill
                    )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for i in cols:
                name = row.xpath("string(td[%d])" % (
                    i + name_offset)).strip()

                if not name or name == 'VACANT':
                    continue

                if "Y" in row.xpath("string(td[%d])" %
                                    (i + yes_offset)):
                    vote.yes(name)
                elif "N" in row.xpath("string(td[%d])" %
                                      (i + no_offset)):
                    vote.no(name)
                else:
                    vote.vote('other', name)

        yield vote
Ejemplo n.º 45
0
    def _parse_votes(self, url, vote, bill):
        '''Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except IndexError:
            # Some of them mysteriously have no motion listed.
            motion = vote['action']

        if not motion:
            motion = vote['action']

        vote['motion'] = motion

        action = vote['action']
        vote_url = vote['vote_url']

        vote = VoteEvent(
            chamber=vote['chamber'],
            start_date=vote['date'],
            motion_text=vote['motion'],
            result='fail',      # placeholder
            classification='passage',
            bill=bill,
            bill_action=vote['action'],
        )
        vote.pupa_id = vote_url         # URL contains sequence number
        vote.add_source(vote_url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            v, name = filter(None, text.split(u'\xa0'))
            # Considering Name is brackets as short name
            regex = re.compile(r".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = 'Short Name: ' + short_name[0]
            else:
                note = ''
            # Name without brackets like 'Kary, Douglas'
            name = re.sub(r"[\(\[].*?[\)\]]", "", name)
            if v == 'Y':
                vote.yes(name, note=note)
            elif v == 'N':
                vote.no(name, note=note)
            elif v == 'E':
                vote.vote('excused', name, note=note)
            elif v == 'A':
                vote.vote('absent', name, note=note)

        # code to deterimine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = 'pass' if passed else 'fail'

        return vote
    def _parse_votes(self, url, vote, bill):
        '''Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except:
            # Some of them mysteriously have no motion listed.
            motion = vote['action']

        if not motion:
            motion = vote['action']

        vote['motion'] = motion

        action = vote['action']
        vote_url = vote['vote_url']

        vote = VoteEvent(
            chamber=vote['chamber'],
            start_date=vote['date'],
            motion_text=vote['motion'],
            result='fail',  # placeholder
            classification='passage',
            bill=bill,
            bill_action=vote['action'],
        )
        vote.pupa_id = vote_url  # URL contains sequence number
        vote.add_source(vote_url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            v, name = filter(None, text.split(u'\xa0'))
            # Considering Name is brackets as short name
            regex = re.compile(".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = 'Short Name: ' + short_name[0]
            else:
                note = ''
            # Name without brackets like 'Kary, Douglas'
            name = re.sub("[\(\[].*?[\)\]]", "", name)
            if v == 'Y':
                vote.yes(name, note=note)
            elif v == 'N':
                vote.no(name, note=note)
            elif v == 'E':
                vote.vote('excused', name, note=note)
            elif v == 'A':
                vote.vote('absent', name, note=note)

        # code to deterimine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = 'pass' if passed else 'fail'

        return vote
Ejemplo n.º 47
0
    def scrape(self):
        for leg_summary in self.legislation(
                created_after=datetime.datetime(2014, 1, 1)):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'], note='web')

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number']:
                bill.add_identifier(leg_details['Law number'],
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor, sponsorship_type, 'person',
                                     primary)

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_document_link(attachment['label'],
                                           attachment['url'],
                                           media_type="application/pdf")

            history = list(history)

            if history:
                earliest_action = min(
                    self.toTime(action['Date']) for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else:
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                if responsible_org == 'Town Hall Meeting':
                    continue
                else:
                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if result and votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url, note='web')

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

            text = self.text(leg_summary['url'])

            if text:
                bill.extras = {
                    'local_classification': leg_summary['Type'],
                    'full_text': text
                }
            else:
                bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
Ejemplo n.º 48
0
    def scrape_pdf_for_votes(self, session, actor, date, motion, href):
        warned = False
        # vote indicator, a few spaces, a name, newline or multiple spaces
        # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
        COUNT_RE = re.compile(
            r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$'
        )
        PASS_FAIL_WORDS = {
            'PASSED': 'pass',
            'PREVAILED': 'fail',
            'ADOPTED': 'pass',
            'CONCURRED': 'pass',
            'FAILED': 'fail',
            'LOST': 'fail',
        }

        pdflines = self.fetch_pdf_lines(href)

        if not pdflines:
            return False

        yes_count = no_count = present_count = 0
        yes_votes = []
        no_votes = []
        present_votes = []
        excused_votes = []
        not_voting = []
        absent_votes = []
        passed = None
        counts_found = False
        vote_lines = []
        for line in pdflines:
            # consider pass/fail as a document property instead of a result of the vote count
            # extract the vote count from the document instead of just using counts of names
            if not line.strip():
                continue
            elif line.strip() in PASS_FAIL_WORDS:
                # Crash on duplicate pass/fail status that differs from previous status
                if passed is not None and passed != PASS_FAIL_WORDS[
                        line.strip()]:
                    raise Exception("Duplicate pass/fail matches in [%s]" %
                                    href)
                passed = PASS_FAIL_WORDS[line.strip()]
            elif COUNT_RE.match(line):
                (yes_count, no_count, present_count,
                 not_voting_count) = COUNT_RE.match(line).groups()
                yes_count = int(yes_count)
                no_count = int(no_count)
                present_count = int(present_count)
                counts_found = True
            elif counts_found:
                for value in VOTE_VALUES:
                    if re.search(r'^\s*({})\s+\w'.format(value), line):
                        vote_lines.append(line)
                        break

        votes = find_columns_and_parse(vote_lines)
        for name, vcode in votes.items():
            if name == 'Mr. Speaker':
                name = session_details[session]['speaker']
            elif name == 'Mr. President':
                name = session_details[session]['president']
            else:
                # Converts "Davis,William" to "Davis, William".
                name = re.sub(r'\,([a-zA-Z])', r', \1', name)

            if vcode == 'Y':
                yes_votes.append(name)
            elif vcode == 'N':
                no_votes.append(name)
            elif vcode == 'P':
                present_votes.append(name)
            elif vcode == 'E':
                excused_votes.append(name)
            elif vcode == 'NV':
                not_voting.append(name)
            elif vcode == 'A':
                absent_votes.append(name)

        # fake the counts
        if yes_count == 0 and no_count == 0 and present_count == 0:
            yes_count = len(yes_votes)
            no_count = len(no_votes)
        else:  # audit
            if yes_count != len(yes_votes):
                self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                             (yes_count, len(yes_votes)))
                warned = True
            if no_count != len(no_votes):
                self.warning("Mismatched no count [expect: %i] [have: %i]" %
                             (no_count, len(no_votes)))
                warned = True

        if passed is None:
            if actor[
                    'classification'] == 'lower':  # senate doesn't have these lines
                self.warning(
                    "No pass/fail word found; fall back to comparing yes and no vote."
                )
                warned = True
            passed = 'pass' if yes_count > no_count else 'fail'

        classification, _ = _categorize_action(motion)
        vote_event = VoteEvent(legislative_session=session,
                               motion_text=motion,
                               classification=classification,
                               organization=actor,
                               start_date=date,
                               result=passed)
        for name in yes_votes:
            vote_event.yes(name)
        for name in no_votes:
            vote_event.no(name)
        for name in present_votes:
            vote_event.vote('other', name)
        for name in excused_votes:
            vote_event.vote('excused', name)
        for name in not_voting:
            vote_event.vote('not voting', name)
        for name in absent_votes:
            vote_event.vote('absent', name)

        vote_event.set_count('yes', yes_count)
        vote_event.set_count('no', no_count)
        vote_event.set_count('other', present_count)
        vote_event.set_count('excused', len(excused_votes))
        vote_event.set_count('absent', len(absent_votes))
        vote_event.set_count('not voting', len(not_voting))

        vote_event.add_source(href)

        # for distinguishing between votes with the same id and on same day
        vote_event.pupa_id = href

        if warned:
            self.warning("Warnings were issued. Best to check %s" % href)
        return vote_event
Ejemplo n.º 49
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
        }

        chamber_map = {
            "H": "lower",
            "S": "upper",
            "J": "joint",
            "E": "legislature",  # Effective date
        }

        action_code_map = {
            "HI": None,
            "SI": None,
            "HH": None,
            "SH": None,
            "HPF": ["introduction"],
            "HDSAS": None,
            "SPF": ["introduction"],
            "HSR": ["reading-2"],
            "SSR": ["reading-2"],
            "HFR": ["reading-1"],
            "SFR": ["reading-1"],
            "HRECM": ["withdrawal", "referral-committee"],
            "SRECM": ["withdrawal", "referral-committee"],
            "SW&C": ["withdrawal", "referral-committee"],
            "HW&C": ["withdrawal", "referral-committee"],
            "HRA": ["passage"],
            "SRA": ["passage"],
            "HPA": ["passage"],
            "HRECO": None,
            "SPA": ["passage"],
            "HTABL": None,  # 'House Tabled' - what is this?
            "SDHAS": None,
            "HCFR": ["committee-passage-favorable"],
            "SCFR": ["committee-passage-favorable"],
            "HRAR": ["referral-committee"],
            "SRAR": ["referral-committee"],
            "STR": ["reading-3"],
            "SAHAS": None,
            "SE": ["passage"],
            "SR": ["referral-committee"],
            "HTRL": ["reading-3", "failure"],
            "HTR": ["reading-3"],
            "S3RLT": ["reading-3", "failure"],
            "HASAS": None,
            "S3RPP": None,
            "STAB": None,
            "SRECO": None,
            "SAPPT": None,
            "HCA": None,
            "HNOM": None,
            "HTT": None,
            "STT": None,
            "SRECP": None,
            "SCRA": None,
            "SNOM": None,
            "S2R": ["reading-2"],
            "H2R": ["reading-2"],
            "SENG": ["passage"],
            "HENG": ["passage"],
            "HPOST": None,
            "HCAP": None,
            "SDSG": ["executive-signature"],
            "SSG": ["executive-receipt"],
            "Signed Gov": ["executive-signature"],
            "HDSG": ["executive-signature"],
            "HSG": ["executive-receipt"],
            "EFF": None,
            "HRP": None,
            "STH": None,
            "HTS": None,
        }

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)["LegislationIndex"]

        for leg in legislation:
            lid = leg["Id"]
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument["StatusHistory"][0]]

            actions = reversed([{
                "code": x["Code"],
                "action": x["Description"],
                "_guid": x["Id"],
                "date": x["Date"],
            } for x in history])

            guid = instrument["Id"]

            # A little bit hacky.
            bill_prefix = instrument["DocumentType"]
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = "%s %s" % (bill_prefix, instrument["Number"])
            if instrument["Suffix"]:
                bill_id += instrument["Suffix"]

            title = instrument["Caption"]
            description = instrument["Summary"]

            if title is None:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=title,
                classification=bill_type,
            )
            bill.add_abstract(description, note="description")
            bill.extras = {"guid": guid}

            if instrument["Votes"]:
                for vote_ in instrument["Votes"]:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                    vote = VoteEvent(
                        start_date=vote_["Date"].strftime("%Y-%m-%d"),
                        motion_text=vote_["Caption"] or "Vote on Bill",
                        chamber={
                            "House": "lower",
                            "Senate": "upper"
                        }[vote_["Branch"]],
                        result="pass"
                        if vote_["Yeas"] > vote_["Nays"] else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    vote.set_count("yes", vote_["Yeas"])
                    vote.set_count("no", vote_["Nays"])
                    vote.set_count("other",
                                   vote_["Excused"] + vote_["NotVoting"])

                    vote.add_source(self.vsource)

                    methods = {"Yea": "yes", "Nay": "no"}

                    if vote_["Votes"] is not None:
                        for vdetail in vote_["Votes"][0]:
                            whom = vdetail["Member"]
                            how = vdetail["MemberVoted"]
                            if whom["Name"] == "VACANT":
                                continue
                            name, district = vote_name_pattern.search(
                                whom["Name"]).groups()
                            vote.vote(methods.get(how, "other"),
                                      name,
                                      note=district)

                    yield vote

            ccommittees = defaultdict(list)
            committees = instrument["Committees"]
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        "House": "lower",
                        "Senate": "upper"
                    }[committee["Type"]]].append(committee["Name"])

            for action in actions:
                action_chamber = chamber_map[action["code"][0]]

                try:
                    action_types = action_code_map[action["code"]]
                except KeyError:
                    error_msg = "Code {code} for action {action} not recognized.".format(
                        code=action["code"], action=action["action"])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(
                    ("committee" in x for x in action_types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])
                    ]

                act = bill.add_action(
                    action["action"],
                    action["date"].strftime("%Y-%m-%d"),
                    classification=action_types,
                    chamber=action_chamber,
                )
                for committee in committees:
                    act.add_related_entity(committee, "organization")
                act.extras = {"code": action["code"], "guid": action["_guid"]}

            sponsors = []
            if instrument["Authors"]:
                sponsors = instrument["Authors"]["Sponsorship"]
                if "Sponsors" in instrument and instrument["Sponsors"]:
                    sponsors += instrument["Sponsors"]["Sponsorship"]

            sponsors = [(x["Type"], self.get_member(x["MemberId"]))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor["Name"]))
                bill.add_sponsorship(
                    name,
                    entity_type="person",
                    classification="primary"
                    if "Author" in typ else "secondary",
                    primary="Author" in typ,
                )

            for version in instrument["Versions"]["DocumentDescription"]:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ["Description", "Url", "Id", "Version"]
                ]
                link = bill.add_version_link(name,
                                             url,
                                             media_type="application/pdf")
                link["extras"] = {
                    "_internal_document_id": doc_id,
                    "_version_id": version_id,
                }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    "session": session,
                    "bid": guid
                }))

            yield bill
Ejemplo n.º 50
0
    def parse_vote(self, bill, link):
        # Server sometimes sends proper error headers,
        # sometimes not
        try:
            self.info("Get {}".format(link))
            text = requests.get(link).text
        except requests.exceptions.HTTPError as err:
            self.warning("{} fetching vote {}, skipping".format(err, link))
            return

        if 'Varnish cache server' in text:
            self.warning("Scrape rate is too high, try re-scraping with "
                         "The --rpm set to a lower number")
            return

        if 'Page Not Found' in text or 'Page Unavailable' in text:
            self.warning("missing vote, skipping")
            return
        member_doc = lxml.html.fromstring(text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        chamber_date_line = ''.join(
            member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
        chamber_date_line_words = chamber_date_line.split()
        vote_chamber = chamber_date_line_words[0]
        vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                               '%m/%d/%Y')
        vote_status = " ".join(chamber_date_line_words[2:-2])
        opinions = member_doc.xpath(
            "//div[@id='main_content']/h3[position() > 1]/text()")
        if len(opinions) > 0:
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 51
0
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ", "",
                                motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}

                source_url = ("http://leginfo.legislature.ca.gov/faces"
                              "/billVotesClient.xhtml?bill_id={}").format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Ejemplo n.º 52
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r'<.*?>'

        if session is None:
            session = self.latest_session()
        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            # Create the bill using its basic information
            bill = Bill(identifier=info['BillNumber'],
                        legislative_session=session,
                        chamber=bill_chamber,
                        title=info['Title'],
                        classification=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsorship(name=sponsor_name,
                                         classification=sponsor_type,
                                         entity_type='person',
                                         primary=(sponsor_type == 'primary'))

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a')

            for version in versions:
                if version.xpath('text()'):
                    bill.add_version_link(
                        note=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode('utf-8')).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(
                    info['BillNumber']))
                yield bill
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.items()}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    assert chambers_passed == set("HS")
                    action_type = 'executive-signature'
                elif actor == 'lower' and any(
                        x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("H")
                elif actor == 'upper' and any(
                        x.lower().startswith(' aspassed')
                        or x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("S")
                else:
                    action_type = None

                bill.add_action(description=re.sub(HTML_TAGS_RE, "",
                                                   action['FullStatus']),
                                date=datetime.datetime.strftime(
                                    datetime.datetime.strptime(
                                        action['StatusDate'], '%m/%d/%Y'),
                                    '%Y-%m-%d'),
                                chamber=actor,
                                classification=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif ("Failed -- " in vote['FullStatus'] or
                      'Veto of the Governor sustained' in vote['FullStatus']):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = int(
                    re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(
                    re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    chamber=('lower'
                             if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=datetime.datetime.strftime(
                        datetime.datetime.strptime(vote['StatusDate'],
                                                   '%m/%d/%Y'), '%Y-%m-%d'),
                    motion_text=re.sub(HTML_TAGS_RE, "",
                                       vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=info['BillNumber'],
                    bill_chamber=bill_chamber)
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Ejemplo n.º 53
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r'<.*?>'

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(info['BillNumber'])
                )

            bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                if version.xpath('text()'):
                    bill.add_version_link(
                        note=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode('utf-8')
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
                yield bill
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    # assert chambers_passed == set("HS")
                    action_type = 'executive-signature'
                elif "Vetoed by the Governor" in action['FullStatus']:
                    action_type = 'executive-veto'
                elif "Read first time" in action['FullStatus'] \
                        or "Read 1st time" in action['FullStatus']:
                    action_type = 'introduction'
                elif "Reported favorably" in action['FullStatus']:
                    action_type = 'committee-passage-favorable'
                elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("H")
                elif actor == 'upper' and any(x.lower().startswith(' aspassed')
                                              or x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("S")
                else:
                    action_type = None

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                    date=datetime.datetime.strftime(
                        datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote['FullStatus'] or
                        "Veto of Governor overridden" in vote['FullStatus']):
                    did_pass = True
                elif ("Failed -- " in vote['FullStatus'] or
                      'Veto of the Governor sustained' in vote['FullStatus']):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    bill=bill,
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=datetime.datetime.strftime(
                        datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Ejemplo n.º 54
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if ("AM" in motion or "PM" in motion) and "/" in motion:
            motion = "Motion not given."

        if "withdrawn" not in motion:
            yes_no_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Aye')]]/font/text()"
            )
            other_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Absent')]]/font/text()"
            )
            abstain_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'17C')]]/font/text()"
            )
            yes_count = int(yes_no_counts[0])
            no_count = int(yes_no_counts[2])
            exc_count = int(other_counts[2])
            absent_count = int(other_counts[0])
            abstain_count = 0
            if abstain_counts:
                abstain_count = int(abstain_counts[0])

            # fix for
            # http://leg.colorado.gov/content/hb19-1029vote65e72e
            if absent_count == -1:
                absent_count = 0

            passed = yes_count > no_count
            vote = VoteEvent(
                chamber=chamber,
                start_date=self._tz.localize(date),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.pupa_id = vote_url
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", abstain_count)
            vote.add_source(vote_url)

            rolls = page.xpath(
                "//tr[preceding-sibling::tr/descendant::"
                "td/div/b/font[contains(text(),'Vote')]]"
            )

            vote_abrv = {
                "Y": "yes",
                "N": "no",
                "E": "excused",
                "A": "absent",
                "-": "absent",
                "17C": "abstain",
            }
            for roll in rolls:
                if len(roll.xpath(".//td/div/font/text()")) > 0:
                    voted = roll.xpath(".//td/div/font/text()")[0].strip()
                    voter = roll.xpath(".//td/font/text()")[0].strip()
                    if voted == "V":
                        continue
                    vote.vote(vote_abrv[voted], voter)
            yield vote
Ejemplo n.º 55
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title: #these are ominbus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
Ejemplo n.º 56
0
    def process_vote(self, vote, bill, member_ids):
        try:
            motion = vote["ReadingDescription"]
        except KeyError:
            self.logger.warning(
                "Can't even figure out what we're voting on. Skipping.")
            return

        if "VoteResult" not in vote:
            if "postponed" in motion.lower():
                result = "Postponed"
                status = True  # because we're talking abtout the motion, not the amendment
            elif "tabled" in motion.lower():
                result = "Tabled"
                status = True
            else:
                self.logger.warning("Could not find result of vote, skipping.")
                return
        else:

            result = vote["VoteResult"].strip().lower()
            statuses = {
                "approved": 'pass',
                "disapproved": 'fail',
                "failed": 'fail',
                "declined": 'fail',
                "passed": 'pass'
            }

            try:
                status = statuses[result]
            except KeyError:
                self.logger.warning(
                    "Unexpected vote result '{result},' skipping vote.".format(
                        result=result))
                return

        date = self.date_format(vote["DateOfVote"])

        leg_votes = vote["MemberVotes"]
        v = VoteEvent(chamber='legislature',
                      start_date=date,
                      motion_text=motion,
                      result=status,
                      classification='passage',
                      bill=bill)
        yes_count = no_count = other_count = 0
        for leg_vote in leg_votes:
            mem_name = member_ids[int(leg_vote["MemberId"])]
            if leg_vote["Vote"] == "1":
                yes_count += 1
                v.yes(mem_name)
            elif leg_vote["Vote"] == "2":
                no_count += 1
                v.no(mem_name)
            else:
                other_count += 1
                v.vote('other', mem_name)

        v.set_count('yes', yes_count)
        v.set_count('no', no_count)
        v.set_count('other', other_count)

        # the documents for the readings are inside the vote
        # level in the json, so we'll deal with them here
        # and also add relevant actions

        if "amendment" in motion.lower():
            if status:
                t = "amendment-passage"
            elif result in ["Tabled", "Postponed"]:
                t = "amendment-deferral"
            else:
                t = "amendment-failure"
        elif "first reading" in motion.lower():
            t = "reading-1"
        elif "1st reading" in motion.lower():
            t = "reading-1"
        elif "second reading" in motion.lower():
            t = "reading-2"
        elif "2nd reading" in motion.lower():
            t = "reading-2"
        elif "third reading" in motion.lower():
            t = "reading-3"
        elif "3rd reading" in motion.lower():
            t = "reading-3"
        elif "final reading" in motion.lower():
            t = "reading-3"
        elif result in ["Tabled", "Postponed"]:
            t = None
        else:
            t = None

        bill.add_action(motion, date, classification=t)

        if "amendment" in t:
            vote["type"] = "amendment"
        elif "reading" in t:
            vote["type"] = t.replace("bill:", "")

        # some documents/versions are hiding in votes.
        if "AttachmentPath" in vote:
            is_version = False
            try:
                if vote["DocumentType"] in [
                        "enrollment", "engrossment", "introduction"
                ]:
                    is_version = True
            except KeyError:
                pass

            if motion in ["enrollment", "engrossment", "introduction"]:
                is_version = True

            self.add_documents(vote["AttachmentPath"], bill, is_version)

        return v
    def _scrape_upper_chamber(self, session):
        if int(session[:4]) >= 2016:
            if len(session) == 4:
                # regular session
                url = 'http://www.senate.mo.gov/%sinfo/jrnlist/default.aspx' % (
                    session[-2:], )
            else:
                # special session
                url = 'http://www.senate.mo.gov/%sinfo/jrnlist/%sJournals.aspx' % (
                    session[-4:-2], session[-2:])
        else:
            url = 'http://www.senate.mo.gov/%sinfo/jrnlist/journals.aspx' % (
                session[-2:])

        vote_types = {
            'YEAS': 'yes',
            'NAYS': 'no',
            'Absent with leave': 'other',
            'Absent': 'other',
            'Vacancies': 'other',
        }

        page = self.lxmlize(url)
        journs = page.xpath("//table")[0].xpath(".//a")
        for a in journs:
            pdf_url = a.attrib['href']
            data = self._get_pdf(pdf_url).decode()
            lines = data.split("\n")

            in_vote = False
            cur_date = None
            vote_type = 'other'
            cur_bill = ''
            cur_motion = ''
            bc = None
            vote = {}
            counts = collections.defaultdict(int)

            for line in lines:
                line = line.strip()

                if cur_date is None:
                    matches = re.findall(date_re, line)
                    if matches != []:
                        date = matches[0]
                        date = "%s, %s %s, %s" % date
                        date = dt.datetime.strptime(date, "%A, %B %d, %Y")
                        cur_date = date

                matches = re.findall(motion_re, line)
                if matches != []:
                    cont = False
                    for x in matches:
                        if "vote" in x.lower():
                            cur_motion = x
                            bill = re.findall(bill_re, x)
                            if bill != []:
                                bc = {
                                    'H': 'lower',
                                    'S': 'upper',
                                    'J': 'legislature'
                                }[bill[0][0]]

                                cur_bill = "%s%s%s %s" % bill[0]
                            in_vote = True
                            cont = True
                    if cont:
                        continue
                if in_vote:
                    if is_vote_end(line):
                        in_vote = False
                        yes, no, other = counts['yes'], counts['no'], counts[
                            'other']
                        if bc is None:
                            continue

                        v = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            motion_text=cur_motion,
                            result='pass' if yes > no else 'fail',
                            legislative_session=session,
                            classification='passage',
                            bill=cur_bill,
                            bill_chamber=bc,
                        )

                        v.add_source(url)
                        v.add_source(pdf_url)

                        v.set_count('yes', yes)
                        v.set_count('no', no)
                        v.set_count('other', other)

                        for key in vote:
                            for person in vote[key]:
                                v.vote(key, person)

                        yield v
                        vote = {}
                        counts = collections.defaultdict(int)
                        continue
                    if "Journal of the Senate" in line:
                        continue
                    if re.match(
                            r".*(Monday|Tuesday|Wednesday|Thursday|Friday|"
                            "Saturday|Sunday), .* \d+, \d+.*", line):
                        continue

                    found = False
                    rl = None
                    for vote_type in list(vote_types):
                        if line.lower().startswith(vote_type.lower()):
                            if "none" in line.lower():
                                continue

                            if "Senator" in line and "Senators" not in line:
                                line = self._clean_line(line)
                                line = line[len(vote_type):]
                                line = line.replace("-Senator ", "")
                                rl = line
                            vote_category = vote_types[vote_type]
                            found = True
                            if vote_category not in vote:
                                vote[vote_category] = []
                    if found and rl is None:
                        continue
                    elif rl:
                        line = rl

                    names = [self._clean_line(x) for x in line.strip().split()]
                    if names == []:
                        continue

                    lname = names[-1]
                    lname = lname.rsplit("-", 1)
                    if len(lname) > 1:
                        person, count = lname
                        if count.isdigit() is False:
                            continue

                        names.pop(-1)
                        names.append(person)
                        counts[vote_category] += int(count)

                    for name in names:
                        vote[vote_category].append(name)
Ejemplo n.º 58
0
    def scrape(self, window=28, matter_ids=None) :
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [self.matter(matter_id) for matter_id in matter_ids.split(',')]
            matters = filter(None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            # Do not scrape private bills introduced before this timestamp.
            if self._is_restricted(matter) and (date < self.START_DATE_PRIVATE_SCRAPE):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view' : self._is_restricted(matter)}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if self._is_restricted(matter):
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        try:
                            raw_option = vote['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Ejemplo n.º 59
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True,
            'NOT SECONDED': False,
            'OVERRIDDEN': True,
            'ADOPTED': True,
        }

        for r in rollcalls:
            proxy_link = proxy["url"] + r["link"]
            (path, resp) = self.urlretrieve(proxy_link)
            text = convert_pdf(path, 'text').decode("utf-8")
            lines = text.split("\n")
            os.remove(path)

            chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
            date_parts = lines[1].strip().split()[-3:]
            date_str = " ".join(date_parts).title() + " " + lines[2].strip()

            vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
            vote_date = pytz.timezone('America/Indiana/Indianapolis').localize(vote_date)
            vote_date = vote_date.isoformat()

            passed = None

            for res, val in result_types.items():
                # We check multiple lines now because the result of the
                # roll call vote as parsed can potentially be split.
                # PDF documents suck.
                for line in lines[3:5]:
                    if res in line.upper():
                        passed = val
                        break

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                yeas = int(lines[4].split()[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(chamber=chamber,
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=original_chamber,
                             start_date=vote_date,
                             motion_text=motion,
                             result="pass" if passed else "fail",
                             classification="passage")

            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.set_count('excused', excused)
            vote.set_count('not voting', not_voting)
            vote.add_source(proxy_link)

            currently_counting = ""

            possible_vote_lines = lines[8:]
            for l in possible_vote_lines:
                l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                l = l.replace("\xc2\xa0", " -")
                if "yea-" in l.lower().replace(" ", ""):
                    currently_counting = "yes"
                elif "nay-" in l.lower().replace(" ", ""):
                    currently_counting = "no"
                elif "excused-" in l.lower().replace(" ", ""):
                    currently_counting = "excused"
                elif "notvoting-" in l.lower().replace(" ", ""):
                    currently_counting = "not voting"
                elif currently_counting == "":
                    pass
                elif re.search(r'v\. \d\.\d', l):
                    # this gets rid of the version number
                    # which is often found at the bottom of the doc
                    pass
                else:
                    voters = l.split("  ")
                    for v in voters:
                        if v.strip():
                            vote.vote(currently_counting, v.strip())

            yield vote
Ejemplo n.º 60
0
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h3[contains(@id, 'hdVote')])")

        if "No Bill Action" in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(", ")[1]

        if location.startswith("House"):
            chamber = "lower"
        elif location.startswith("Senate"):
            chamber = "upper"
        elif location.startswith("Joint"):
            chamber = "legislature"
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        motion = ", ".join(header.split(", ")[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//span[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//span[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//span[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith("Do Pass"):
                type = "passage"
            elif motion == "Concurred in amendments":
                type = "amendment"
            elif motion == "Veto override":
                type = "veto_override"
            else:
                type = "other"

            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification=type,
                bill=bill,
            )
            # The vote page URL has a unique ID
            # However, some votes are "consent calendar" events,
            # and relate to the passage of _multiple_ bills
            # These can't be modeled yet in Pupa, but for now we can
            # append a bill ID to the URL that forms the `pupa_id`
            # https://github.com/opencivicdata/pupa/issues/308
            vote.pupa_id = "{}#{}".format(url,
                                          bill.identifier.replace(" ", ""))

            vote.add_source(url)
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", excused_count)
            vote.set_count("absent", absent_count)

            for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
                option_or_person = td.text.strip()
                if option_or_person in ("Aye", "Yea"):
                    vote.yes(td.getprevious().text.strip())
                elif option_or_person == "Nay":
                    vote.no(td.getprevious().text.strip())
                elif option_or_person == "Excused":
                    vote.vote("excused", td.getprevious().text.strip())
                elif option_or_person == "Absent":
                    vote.vote("absent", td.getprevious().text.strip())

            yield vote