Ejemplo n.º 1
0
def test_vote():
    """Check Vote construction, per-member vote recording and validation."""
    vote = Vote('upper', datetime.datetime(2012, 1, 1), 'passage', True,
                3, 1, 2, note='note')
    expected = {
        'chamber': 'upper',
        'date': datetime.datetime(2012, 1, 1),
        'motion': 'passage',
        'passed': True,
        'yes_count': 3,
        'no_count': 1,
        'other_count': 2,
        'type': 'other',
        'yes_votes': [],
        'no_votes': [],
        'other_votes': [],
        'note': 'note',
        '_type': 'vote',
        'sources': [],
    }
    assert_equal(vote, expected)

    # Record the individual votes so they line up with the 3/1/2 totals.
    yes_voters = ['Lincoln', 'Adams', 'Johnson']
    for name in yes_voters:
        vote.yes(name)
    assert_equal(vote['yes_votes'], yes_voters)

    no_voters = ['Kennedy']
    for name in no_voters:
        vote.no(name)
    assert_equal(vote['no_votes'], no_voters)

    other_voters = ['Polk', 'Pierce']
    for name in other_voters:
        vote.other(name)
    assert_equal(vote['other_votes'], other_voters)

    # Totals and recorded names agree, so validation must succeed.
    vote.validate()

    # A fourth "yes" voter contradicts yes_count == 3 and must be rejected.
    vote.yes('Clinton')
    with assert_raises(ValueError):
        vote.validate()
Ejemplo n.º 2
0
    def add_vote(self, bill, chamber, date, text, url):
        """Build a Vote from an action's text and attach it to ``bill``.

        The yes/no totals come from the first "Ayes N, Noes M" (or "Nays")
        pair found in ``text``; ``text`` itself is used as the motion and the
        vote is treated as passed when ayes outnumber noes. When ``url`` is
        truthy it is recorded as a source and handed to
        ``add_house_votes`` (URL contains 'av') or ``add_senate_votes``
        (URL contains 'sv').

        Raises:
            IndexError: if ``text`` contains no recognizable totals.
        """
        votes = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        # Use the first classifier (in iteration order) whose pattern matches
        # the start of the text. ``items()`` works on Python 2 and 3, whereas
        # ``iteritems()`` does not exist on Python 3.
        vtype = 'other'
        for regex, motion_type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = motion_type
                break

        v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if 'av' in url:
                self.add_house_votes(v, url)
            elif 'sv' in url:
                self.add_senate_votes(v, url)

        # other count is brute forced: it is whatever number of "other"
        # votes has been recorded on the vote by this point.
        v['other_count'] = len(v['other_votes'])
        v.validate()
        bill.add_vote(v)
Ejemplo n.º 3
0
    def add_vote(self, bill, chamber, date, text, url):
        """Build a Vote from an action's text and attach it to ``bill``.

        The yes/no totals come from the first "Ayes N, Noes M" (or "Nays")
        pair found in ``text``; ``text`` itself is used as the motion and the
        vote is treated as passed when ayes outnumber noes. When ``url`` is
        truthy it is recorded as a source and handed to
        ``add_house_votes`` (URL contains 'av') or ``add_senate_votes``
        (URL contains 'sv').

        Raises:
            IndexError: if ``text`` contains no recognizable totals.
        """
        votes = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        # Use the first classifier (in iteration order) whose pattern matches
        # the start of the text. ``items()`` works on Python 2 and 3, whereas
        # ``iteritems()`` does not exist on Python 3.
        vtype = 'other'
        for regex, motion_type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = motion_type
                break

        v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if 'av' in url:
                self.add_house_votes(v, url)
            elif 'sv' in url:
                self.add_senate_votes(v, url)

        # other count is brute forced: it is whatever number of "other"
        # votes has been recorded on the vote by this point.
        v['other_count'] = len(v['other_votes'])
        v.validate()
        bill.add_vote(v)
Ejemplo n.º 4
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse the roll-call PDF at ``url`` and add the vote to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion is
        taken from the fifth line of the text and the totals from the
        "Yeas - N", "Nays - N" and "Not Voting - N" markers. Individual
        votes are read from the lines after the ninth, until a line
        containing 'after roll call' or 'Indication of Vote', or an entry
        named "PAIR", is reached.

        Returns without adding anything when the text has fewer than five
        lines or carries no "Yeas - N" total.

        Raises:
            AttributeError: if the "Nays" or "Not Voting" total is missing.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)

        lines = text.split('\n')
        try:
            motion = lines[4].strip()
        except IndexError:
            return

        try:
            yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
        except AttributeError:
            return

        no_count = int(re.search(r'Nays - (\d+)', text).group(1))
        other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
        # Counted as passed only when yeas exceed nays plus non-voters.
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if 'after roll call' in line or 'Indication of Vote' in line:
                break
            if 'Presiding' in line:
                continue

            # Each entry is followed by "-<number>", which separates the
            # columns of a line.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
                if match is None:
                    # No vote code in front of the name.
                    vote.other(col)
                    continue

                code, name = match.groups()
                if name == "PAIR":
                    # Stop reading votes altogether at the PAIR entry.
                    break
                if code == 'Y':
                    vote.yes(name)
                elif code == 'N':
                    vote.no(name)
                else:
                    vote.other(name)
            else:
                # Line fully consumed; move on to the next one.
                continue
            break

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 5
0
    def scrape_vote(self, bill, action_text, url):
        """Scrape the vote page at ``url`` and add the vote to ``bill``.

        ``action_text`` supplies the chamber, the motion, the preliminary
        "(yes-no)" totals and the outcome keyword. The page's ``<nobr>``
        text nodes supply the date, the final totals and the member names,
        which are listed under "Voting Yea", "Voting Nay" and
        "Not Voting"/"Excused" headings.

        Raises:
            Exception: if the motion carries no known outcome keyword.
            IndexError: if the motion has no "(yes-no)" totals.
        """
        doc = lxml.html.fromstring(self.urlopen(url))

        # action_text might look like
        # "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
        # NOTE(review): chamber stays unset for any other prefix - confirm
        # callers only pass Senate/House floor votes.
        if action_text.startswith('Vote - Senate Floor - '):
            action_text = action_text[22:]
            chamber = 'upper'
        elif action_text.startswith('Vote - House Floor - '):
            action_text = action_text[21:]
            chamber = 'lower'

        # Split on the last separator only, so a motion that itself
        # contains " - " is kept whole.
        motion, unused_date = action_text.rsplit(' - ', 1)
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Failed' in motion:
            motion = motion.split(' Failed')[0]
            passed = False
        elif 'Floor Amendment' in motion:
            passed = int(yes_count) > int(no_count)
        else:
            raise Exception('unknown motion: %s' % motion)

        vote = Vote(chamber=chamber, date=None, motion=motion,
                    yes_count=int(yes_count), no_count=int(no_count),
                    other_count=0, passed=passed)
        # Recorder for the member list currently being read, if any.
        vfunc = None

        for text in doc.xpath('//nobr/text()'):
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                stamp = text.split(': ', 1)[1]
                try:
                    # %p only adjusts the hour when it is parsed with %I.
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %I:%M %p')
                except ValueError:
                    # Hour outside 1-12: fall back to the 24-hour reading.
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %H:%M %p')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                self.debug(text)
                yeas, nays, nv, exc, absent = re.match(
                    r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+'
                    r'(\d+) Excused \(Absent\)\s+(\d+) Absent',
                    text).groups()
                # The page totals replace the ones taken from action_text.
                vote['yes_count'] = int(yeas)
                vote['no_count'] = int(nays)
                vote['other_count'] = int(nv) + int(exc) + int(absent)
            elif 'Voting Yea' in text:
                vfunc = vote.yes
            elif 'Voting Nay' in text:
                vfunc = vote.no
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = vote.other
            elif vfunc:
                # Any other text after a heading is treated as a member name.
                vfunc(text)

        vote.validate()
        vote.add_source(url)
        bill.add_vote(vote)
Ejemplo n.º 6
0
    def scrape_votes(self, bill, link):
        """Parse every vote on the page at ``link`` and add it to ``bill``.

        The text of the ``lblVoteData`` span is split into one chunk per
        vote. Each chunk is split on runs of ten non-breaking spaces: the
        first field is the motion (and may contain the date), the remaining
        fields hold the totals and the member lists. The chamber is 'lower'
        when ``link`` contains "ChamberVoting=H", otherwise 'upper'.

        Returns:
            ``bill``, after the votes have been added to it.
        """
        # Compiled once: none of these depend on the vote being parsed.
        vote_regex = re.compile(r"\d+$")
        aye_regex = re.compile(r"^.+voting aye were: (.+) -")
        no_regex = re.compile(r"^.+voting no were: (.+) -")
        other_regex = re.compile(r"^.+present and not voting were: (.+) -")

        if "ChamberVoting=H" in link:
            chamber = "lower"
        else:
            chamber = "upper"

        with self.urlopen(link) as page:
            page = lxml.html.fromstring(page)
            raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
            # Drop everything before the first "<word> by <name> -" heading.
            raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", raw_vote_data.strip())[1:]
            for raw_vote in raw_vote_data:
                raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
                motion = raw_vote[0]

                # vote_date stays None when the motion carries no date.
                vote_date = re.search(r"(\d+/\d+/\d+)", motion)
                if vote_date:
                    vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

                passed = ("Passed" in motion
                          or "Recommended for passage" in motion
                          or "Adopted" in raw_vote[1])
                yes_count = 0
                no_count = 0
                other_count = 0
                ayes = []
                nos = []
                others = []

                for v in raw_vote[1:]:
                    v = v.strip()
                    if v.startswith("Ayes...") and vote_regex.search(v):
                        yes_count = int(vote_regex.search(v).group())
                    elif v.startswith("Noes...") and vote_regex.search(v):
                        no_count = int(vote_regex.search(v).group())
                    elif v.startswith("Present and not voting...") and vote_regex.search(v):
                        # Accumulated, unlike the yes/no totals.
                        other_count += int(vote_regex.search(v).group())
                    elif aye_regex.search(v):
                        ayes = aye_regex.search(v).groups()[0].split(", ")
                    elif no_regex.search(v):
                        nos = no_regex.search(v).groups()[0].split(", ")
                    elif other_regex.search(v):
                        others += other_regex.search(v).groups()[0].split(", ")

                vote = Vote(chamber, vote_date, motion, passed, yes_count, no_count, other_count)
                vote.add_source(link)
                for a in ayes:
                    vote.yes(a)
                for n in nos:
                    vote.no(n)
                for o in others:
                    vote.other(o)

                vote.validate()
                bill.add_vote(vote)

        return bill
Ejemplo n.º 7
0
def test_vote():
    """Check Vote construction, per-member vote recording and validation."""
    when = datetime.datetime(2012, 1, 1)
    v = Vote('upper', when, 'passage', True, 3, 1, 2, note='note')
    assert_equal(v, {
        'chamber': 'upper',
        'date': datetime.datetime(2012, 1, 1),
        'motion': 'passage',
        'passed': True,
        'yes_count': 3,
        'no_count': 1,
        'other_count': 2,
        'type': 'other',
        'yes_votes': [],
        'no_votes': [],
        'other_votes': [],
        'note': 'note',
        '_type': 'vote',
        'sources': [],
    })

    # (recording method, key of the resulting list, names to record)
    roll_calls = (
        (v.yes, 'yes_votes', ['Lincoln', 'Adams', 'Johnson']),
        (v.no, 'no_votes', ['Kennedy']),
        (v.other, 'other_votes', ['Polk', 'Pierce']),
    )
    for record, key, names in roll_calls:
        for name in names:
            record(name)
        assert_equal(v[key], names)

    # Recorded names now match the 3/1/2 totals, so validation passes.
    v.validate()

    # One extra "yes" voter breaks the totals and must fail validation.
    v.yes('Clinton')
    with assert_raises(ValueError):
        v.validate()
Ejemplo n.º 8
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse the roll-call PDF at ``url`` and add the vote to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion is
        the fifth line of the text; totals come from the "Yeas - N",
        "Nays - N" and "Not Voting - N" markers. Individual votes are read
        from the lines after the ninth, until a line containing
        'after roll call' or 'Indication of Vote' is reached.

        Raises:
            IndexError: if the text has fewer than five lines.
            AttributeError: if one of the totals is missing.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)

        lines = text.split('\n')
        motion = lines[4].strip()

        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
        no_count = int(re.search(r'Nays - (\d+)', text).group(1))
        other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
        # Counted as passed only when yeas exceed nays plus non-voters.
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if 'after roll call' in line:
                break
            if 'Indication of Vote' in line:
                break
            if 'Presiding' in line:
                continue

            # Each entry is followed by "-<number>", which separates the
            # columns of a line.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
                if not match:
                    # No vote code in front of the name.
                    vote.other(col)
                    continue

                code, name = match.groups()
                if code == 'Y':
                    vote.yes(name)
                elif code == 'N':
                    vote.no(name)
                elif code == '*':
                    # skip paired voters, don't factor into count
                    continue
                else:
                    vote.other(name)

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 9
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse the roll-call PDF at ``url`` and add the vote to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion is
        the fifth line of the text; totals come from the "Yeas - N",
        "Nays - N" and "Not Voting - N" markers. Individual votes are read
        from the lines after the ninth, until a line containing
        "after roll call" or "Indication of Vote" is reached.

        Raises:
            IndexError: if the text has fewer than five lines.
            AttributeError: if one of the totals is missing.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, "text")
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)

        lines = text.split("\n")
        motion = lines[4].strip()

        yes_count = int(re.search(r"Yeas - (\d+)", text).group(1))
        no_count = int(re.search(r"Nays - (\d+)", text).group(1))
        other_count = int(re.search(r"Not Voting - (\d+)", text).group(1))
        # Counted as passed only when yeas exceed nays plus non-voters.
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count, other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if "after roll call" in line:
                break
            if "Indication of Vote" in line:
                break
            if "Presiding" in line:
                continue

            # Each entry is followed by "-<number>", which separates the
            # columns of a line.
            for col in re.split(r"-\d+", line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r"(Y|N|EX)\s+(.+)$", col)
                if not match:
                    # No recognized vote code in front of the name.
                    vote.other(col)
                    continue

                code, name = match.groups()
                if code == "Y":
                    vote.yes(name)
                elif code == "N":
                    vote.no(name)
                else:
                    vote.other(name)

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 10
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse the roll-call PDF at ``url`` and add the vote to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion is
        the fifth line of the text; totals come from the "Yeas - N",
        "Nays - N" and "Not Voting - N" markers. Individual votes are read
        from the lines after the ninth, until a line containing
        'after roll call' is reached.

        Raises:
            IndexError: if the text has fewer than five lines.
            AttributeError: if one of the totals is missing.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)

        lines = text.split('\n')
        motion = lines[4].strip()

        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
        no_count = int(re.search(r'Nays - (\d+)', text).group(1))
        other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
        # Counted as passed only when yeas exceed nays plus non-voters.
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if 'after roll call' in line:
                break
            if 'Presiding' in line:
                continue

            # Each entry is followed by "-<number>", which separates the
            # columns of a line.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX)\s+(.+)$', col)
                if not match:
                    # No recognized vote code in front of the name.
                    vote.other(col)
                    continue

                code, name = match.groups()
                if code == 'Y':
                    vote.yes(name)
                elif code == 'N':
                    vote.no(name)
                else:
                    vote.other(name)

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 11
0
    def add_vote(self, bill, chamber, date, text, url):
        """Build a Vote from an action's text and attach it to ``bill``.

        The yes/no totals come from the first "Ayes N, Noes M" (or "Nays")
        pair found in ``text``; ``text`` itself is used as the motion and the
        vote is treated as passed when ayes outnumber noes. When ``url`` is
        truthy it is recorded as a source and handed to
        ``add_house_votes`` (URL contains "av") or ``add_senate_votes``
        (URL contains "sv").

        Raises:
            IndexError: if ``text`` contains no recognizable totals.
        """
        votes = re.findall(r"Ayes (\d+)\, N(?:oes|ays) (\d+)", text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        # Use the first classifier (in iteration order) whose pattern matches
        # the start of the text. ``items()`` works on Python 2 and 3, whereas
        # ``iteritems()`` does not exist on Python 3.
        vtype = "other"
        for regex, motion_type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = motion_type
                break

        v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if "av" in url:
                self.add_house_votes(v, url)
            elif "sv" in url:
                self.add_senate_votes(v, url)

        v.validate()
        bill.add_vote(v)
Ejemplo n.º 12
0
    def scrape_lower_committee_votes(self, session_number, bill):
        '''
        House committee roll calls are not available on the Senate's
        website. Furthermore, the House uses an internal ID system in
        its URLs, making accessing those pages non-trivial.

        This function will fetch all the House committee votes for the
        given bill, and add the votes to that object.

        The bill is looked up by posting its number to the House search
        form; every "See Votes" link on the resulting bill page is then
        scraped into one Vote.

        Raises:
            ValueError: if a page does not contain exactly one of an
                expected element (bill link, date, committee, action,
                or the two spans of a member cell).
            IndexError: if a member has an unrecognized vote code.
        '''

        house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id']).group(1)

        form = {
            'rblChamber': 'B',
            'ddlSession': session_number,
            'ddlBillList': '-1',
            'txtBillNumber': bill_number,
            'ddlSponsor': '-1',
            'ddlReferredTo': '-1',
            'SubmittedByControl': '',
        }
        doc = lxml.html.fromstring(self.post(url=house_url, data=form).text)
        doc.make_links_absolute(house_url)

        # Single-element unpacking: fails unless exactly one bill matches.
        (bill_link, ) = doc.xpath(
            '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
        bill_doc = self.lxmlize(bill_link)
        links = bill_doc.xpath('//a[text()="See Votes"]/@href')

        for link in links:
            vote_doc = self.lxmlize(link)

            (date, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
            date = datetime.datetime.strptime(date,
                                              '%m/%d/%Y %I:%M:%S %p').date()

            # The totals are read from the last nested table on the page,
            # with all whitespace runs collapsed to single spaces.
            totals = vote_doc.xpath('//table//table')[-1].text_content()
            totals = re.sub(r'(?mu)\s+', " ", totals).strip()
            (yes_count, no_count, other_count) = [
                int(x) for x in re.search(
                    r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                    r'Total Missed:\s+(\d+)', totals).groups()
            ]
            passed = yes_count > no_count

            (committee, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
            (action, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
            motion = "{} ({})".format(action, committee)

            vote = Vote('lower', date, motion, passed, yes_count, no_count,
                        other_count)
            vote.add_source(link)

            for cell in vote_doc.xpath('//table//table//table//td'):
                if not cell.text_content().strip():
                    continue

                # First span holds the vote code, second the member's name.
                (member, ) = cell.xpath('span[2]//text()')
                (code, ) = cell.xpath('span[1]//text()')

                if code == "Y":
                    vote.yes(member)
                elif code == "N":
                    vote.no(member)
                elif code == "-":
                    vote.other(member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif re.search(r'\([YN]\)', code):
                    continue
                else:
                    raise IndexError(
                        "Unknown vote type found: {}".format(code))

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 13
0
    def scrape_uppper_committee_vote(self, bill, date, url):
        """Parse the committee vote PDF at ``url`` and add it to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion is
        whatever follows "FINAL ACTION:" on the sixth line; when it is empty
        a warning is logged and nothing is added. Member rows start after
        the "Yea  Nay" header row and end at the first blank line; a
        member's vote is decided by the column in which the vote code sits.

        Raises:
            IndexError: if no "Yea  Nay" header row is found.
            AssertionError: if a vote code is in neither the yea nor the
                nay column.
            AttributeError: if a member row or the TOTALS line cannot be
                parsed.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)
        lines = text.split("\n")

        (_, motion) = lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.warning("Vote appears to be empty")
            return

        # Index of the first header row; enumerate avoids re-searching the
        # whole list for every matching line.
        header_rows = [
            index for index, row in enumerate(lines)
            if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', row)
        ]
        vote_top_row = header_rows[0]
        yea_columns_end = lines[vote_top_row].index("Yea") + len("Yea")
        nay_columns_begin = lines[vote_top_row].index("Nay")

        votes = {'yes': [], 'no': [], 'other': []}
        for line in lines[(vote_top_row + 1):]:
            # End loop as soon as no more members are found
            if not line.strip():
                break

            member = re.search(
                r'''(?x)
                        ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                        ([A-Z][a-z]+  # Name must have lower-case characters
                        [\w\-\s]+)  # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?  # Leadership has an all-caps title
                        (?:\s{2,}.*)?  # Name ends when many spaces are seen
                        ''', line).group(1)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
            if not did_vote:
                votes['other'].append(member)
                continue

            # Check where the "X" or vote code is on the page
            vote_column = len(line) - len(line.lstrip())
            if vote_column <= yea_columns_end:
                votes['yes'].append(member)
            elif vote_column >= nay_columns_begin:
                votes['no'].append(member)
            else:
                raise AssertionError(
                    "Unparseable vote found for {0} in {1}:\n{2}".
                    format(member, url, line))

        totals = re.search(r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS',
                           text).groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        passed = (yes_count > no_count)
        # The other count is derived from the parsed rows, not the PDF.
        other_count = len(votes['other'])

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)
        vote['yes_votes'] = votes['yes']
        vote['no_votes'] = votes['no']
        vote['other_votes'] = votes['other']

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 14
0
    def scrape_floor_vote(self, chamber, bill, date, url):
        """Parse the floor vote PDF at ``url`` and add the vote to ``bill``.

        The PDF is downloaded, converted to text and deleted. The motion,
        the totals line and the first vote line are located by line index;
        the indexes shift when the motion sits one line higher or spans up
        to two extra lines. If the parsed votes do not validate, members
        listed without a vote code are added as "other" and validation is
        retried.

        Raises:
            AssertionError: if no motion name can be found.
            AttributeError: if the totals line does not match.
            ValueError: if the votes still do not validate after the retry.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)
        lines = text.split("\n")

        # Default positions; adjusted below to fit the actual layout.
        motion_index = 4
        totals_index = 6
        vote_start_index = 9

        motion = lines[motion_index].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if (not motion
                and not lines[motion_index - 1].startswith("Calendar Page:")):
            motion = lines[motion_index - 1]
            motion_index -= 1
            totals_index -= 1
            vote_start_index -= 1
        elif not motion:
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError(
                "Floor vote's motion name appears to be empty")

        # The motion may continue on up to two following lines; every
        # continuation line pushes the totals and votes down by one.
        for _extra_motion_line in range(2):
            motion_index += 1
            if lines[motion_index].strip():
                motion = "{}, {}".format(motion, lines[motion_index].strip())
                totals_index += 1
                vote_start_index += 1
            else:
                break

        (yes_count, no_count, other_count) = [
            int(x) for x in re.search(
                r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
                lines[totals_index]).groups()
        ]
        passed = (yes_count > no_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[vote_start_index:]:
            if not line.strip():
                break

            # Drop the presiding officer's title so only the name remains.
            if " President " in line:
                line = line.replace(" President ", " ")
            elif " Speaker " in line:
                line = line.replace(" Speaker ", " ")

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
                vote.yes(member)
            for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
                vote.no(member)
            for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
                vote.other(member)

        try:
            vote.validate()
        except ValueError:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.logger.info("Votes don't add up; looking for additional ones")
            for line in lines[vote_start_index:]:
                if not line.strip():
                    break
                for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                         line):
                    vote.other(member)

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 15
0
    def scrape_vote(self, bill, action_text, url):
        """Scrape the vote page at ``url`` and add the vote to ``bill``.

        ``action_text`` supplies the chamber, the motion, the preliminary
        "(yes-no)" totals and the outcome keyword. The page's ``<nobr>``
        text nodes supply the date, the final totals and the member names,
        which are listed under "Voting Yea", "Voting Nay" and
        "Not Voting"/"Excused" headings.

        Raises:
            Exception: if the motion carries no known outcome keyword.
            IndexError: if the motion has no "(yes-no)" totals.
        """
        doc = lxml.html.fromstring(self.urlopen(url))

        # action_text might look like
        # "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
        # NOTE(review): chamber stays unset for any other prefix - confirm
        # callers only pass Senate/House floor votes.
        if action_text.startswith('Vote - Senate Floor - '):
            action_text = action_text[22:]
            chamber = 'upper'
        elif action_text.startswith('Vote - House Floor - '):
            action_text = action_text[21:]
            chamber = 'lower'

        # Split on the last separator only, so a motion that itself
        # contains " - " is kept whole.
        motion, unused_date = action_text.rsplit(' - ', 1)
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Failed' in motion:
            motion = motion.split(' Failed')[0]
            passed = False
        elif 'Floor Amendment' in motion:
            passed = int(yes_count) > int(no_count)
        else:
            raise Exception('unknown motion: %s' % motion)

        vote = Vote(chamber=chamber,
                    date=None,
                    motion=motion,
                    yes_count=int(yes_count),
                    no_count=int(no_count),
                    other_count=0,
                    passed=passed)
        # Recorder for the member list currently being read, if any.
        vfunc = None

        for text in doc.xpath('//nobr/text()'):
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                stamp = text.split(': ', 1)[1]
                try:
                    # %p only adjusts the hour when it is parsed with %I.
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %I:%M %p')
                except ValueError:
                    # Hour outside 1-12: fall back to the 24-hour reading.
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %H:%M %p')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                self.debug(text)
                yeas, nays, nv, exc, absent = re.match(
                    r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+'
                    r'(\d+) Excused \(Absent\)\s+(\d+) Absent',
                    text).groups()
                # The page totals replace the ones taken from action_text.
                vote['yes_count'] = int(yeas)
                vote['no_count'] = int(nays)
                vote['other_count'] = int(nv) + int(exc) + int(absent)
            elif 'Voting Yea' in text:
                vfunc = vote.yes
            elif 'Voting Nay' in text:
                vfunc = vote.no
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = vote.other
            elif vfunc:
                # Any other text after a heading is treated as a member name.
                vfunc(text)

        vote.validate()
        vote.add_source(url)
        bill.add_vote(vote)
Ejemplo n.º 16
0
    def scrape_votes(self, bill, url):
        """Parse every roll call on the page at ``url`` and add it to ``bill``.

        Each roll call starts at a ``<p>`` whose text matches
        "OKLAHOMA HOUSE" or "OKLAHOMA STATE SENATE". The motion, the RCS
        number, the date, the per-category totals and the member names are
        all read from the ``<p>`` siblings that follow that header. A roll
        call whose totals line carries extra trailing text is logged and
        skipped.

        Raises:
            AssertionError: if the motion paragraph is empty.
            Exception: if a name recorded as a "no" vote contains ':'.
        """
        page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

        re_ns = "http://exslt.org/regular-expressions"
        # Raw string: the \s escapes are meant for the EXSLT regex engine.
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={'re': re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            assert motion.strip(), "Motion text not found"
            # A trailing PASSED/FAILED states the outcome; without it the
            # outcome is derived from the totals further down.
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            # The date is read from the element right after the RCS line.
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the category ('yes', 'no', 'other') that totals and
            # names are currently being attributed to.
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            # Names are only collected once the YEAS line has been seen.
            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break

                match = re.match(
                    r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)',
                    line)
                if match:
                    if match.group(1) == 'YEAS' and 'RCS#' not in line:
                        vtype = 'yes'
                        seen_yes = True
                    elif match.group(1) == 'NAYS' and seen_yes:
                        vtype = 'no'
                    elif match.group(1) == 'VACANT':
                        continue  # skip these
                    elif seen_yes:
                        vtype = 'other'
                    # Anything after the number means the layout is not
                    # the expected one.
                    if seen_yes and match.group(3).strip():
                        self.logger.warning("Bad vote format, skipping.")
                        bad_vote = True
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    # Names on a line are separated by three spaces.
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE' in name or 'SENATE ' in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber,
                        date,
                        motion,
                        passed,
                        counts['yes'],
                        counts['no'],
                        counts['other'],
                        rcs_num=rcs)
            # NOTE(review): validated once before any names are attached
            # and again afterwards - confirm the first call is intended.
            vote.validate()

            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                if ':' in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes['other']:
                vote.other(name)

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 17
0
    def scrape_votes(self, bill, link):
        """Scrape every roll call on a vote page and attach it to ``bill``.

        The text of the page's ``lblVoteData`` span is split into one chunk
        per vote.  Each chunk is then split on a run of ten non-breaking
        spaces: the first piece is the motion, the remaining pieces are tally
        lines ("Ayes...", "Noes...", "Present and not voting...") and voter
        lists ("... voting aye were: ...").

        :param bill: bill object that receives each vote through ``add_vote``
        :param link: URL of the vote page; also recorded as each vote's source
        :returns: the ``bill`` object that was passed in
        """
        # The patterns never change between votes, so compile them once.
        # Raw strings avoid invalid-escape warnings for ``\d``, ``\w``, ``\s``.
        chunk_regex = re.compile(r'\w+? by [\w ]+?\s+-')
        date_regex = re.compile(r'(\d+/\d+/\d+)')
        vote_regex = re.compile(r'\d+$')
        aye_regex = re.compile(r'^.+voting aye were: (.+) -')
        no_regex = re.compile(r'^.+voting no were: (.+) -')
        other_regex = re.compile(r'^.+present and not voting were: (.+) -')

        # The chamber is determined by the URL alone.
        if 'ChamberVoting=H' in link:
            chamber = 'lower'
        else:
            chamber = 'upper'

        page = self.urlopen(link)
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
        # Whatever precedes the first "<word> by <name> -" header is dropped.
        raw_vote_data = chunk_regex.split(raw_vote_data.strip())[1:]
        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            # The date stays None when the motion text carries no m/d/Y date.
            vote_date = None
            date_match = date_regex.search(motion)
            if date_match:
                vote_date = datetime.datetime.strptime(date_match.group(), '%m/%d/%Y')

            # Guard the index: a chunk without any tally segment has only the
            # motion piece, and must not raise IndexError here.
            passed = ('Passed' in motion or
                      'Recommended for passage' in motion or
                      (len(raw_vote) > 1 and 'Adopted' in raw_vote[1]))

            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for v in raw_vote[1:]:
                v = v.strip()

                # Tally lines end in the count; a tally prefix without a
                # trailing number falls through to the voter-list patterns.
                count_match = vote_regex.search(v)
                if count_match:
                    if v.startswith('Ayes...'):
                        yes_count = int(count_match.group())
                        continue
                    if v.startswith('Noes...'):
                        no_count = int(count_match.group())
                        continue
                    if v.startswith('Present and not voting...'):
                        other_count += int(count_match.group())
                        continue

                names_match = aye_regex.search(v)
                if names_match:
                    ayes = names_match.group(1).split(', ')
                    continue
                names_match = no_regex.search(v)
                if names_match:
                    nos = names_match.group(1).split(', ')
                    continue
                names_match = other_regex.search(v)
                if names_match:
                    # "Other" voters accumulate across lines, unlike ayes/nos.
                    others += names_match.group(1).split(', ')

            vote = Vote(chamber, vote_date, motion, passed, yes_count,
                        no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            for o in others:
                vote.other(o)

            vote.validate()
            bill.add_vote(vote)

        return bill
Ejemplo n.º 18
0
    def scrape(self, session, chambers):
        """Scrape all Vermont bills and resolutions for ``session`` and save them.

        For every entry returned by the legislature's JSON listing endpoints
        this builds a ``Bill``, adds sponsors and text versions from the
        bill's status page and, when that page exposes an internal bill ID,
        adds actions and roll-call votes from further JSON endpoints.  Each
        bill is persisted with ``self.save_bill``.

        :param session: session identifier; everything after its first five
            characters is used as the year slug in URLs
        :param chambers: not used by this method
        :raises AssertionError: on an unrecognised bill-number prefix, a
            constitutional amendment whose ``Body`` is neither H nor S, an
            action with an unknown chamber code, an unexpected passage
            sequence, or a roll call that is neither "Passed -- " nor
            "Failed -- "
        """
        HTML_TAGS_RE = r'<.*?>'

        # (prefix, bill type, chamber), tested in order: the longer prefixes
        # must come before the bare "H."/"S." ones.  A chamber of None means
        # it has to be read from the entry's 'Body' field instead.
        BILL_PREFIXES = (
            ('J.R.H.', 'joint resolution', 'lower'),
            ('J.R.S.', 'joint resolution', 'upper'),
            ('H.C.R.', 'concurrent resolution', 'lower'),
            ('S.C.R.', 'concurrent resolution', 'upper'),
            ('H.R.', 'resolution', 'lower'),
            ('S.R.', 'resolution', 'upper'),
            ('PR.', 'constitutional amendment', None),
            ('H.', 'bill', 'lower'),
            ('S.', 'bill', 'upper'),
        )

        # Exact lower-chamber statuses that count as passage
        LOWER_PASSED_STATUSES = (
            "Passed",
            "Read Third time and Passed",
            "Adopted",
            "Adopted in Concurrence",
            "Read and Adopted",
            "Read and Adopted in Concurrence",
            "Passed in Concurrence",
            "Passed in Concurrence with Proposal of Amendment",
        )
        # Upper-chamber statuses are matched by prefix rather than exactly
        UPPER_PASSED_PREFIXES = (
            "Read 3rd time & passed",
            "Read & adopted",
            "Adopted",
        )

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        # Treat a null 'data' member as an empty list instead of crashing
        bills = json.loads(bills_json)['data'] or []

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}
            bill_number = info['BillNumber']

            # Identify the bill type and chamber
            for prefix, bill_type, bill_chamber in BILL_PREFIXES:
                if bill_number.startswith(prefix):
                    break
            else:
                raise AssertionError(
                        "Unknown bill type found: '{}'".
                        format(bill_number))

            if bill_chamber is None:
                # Constitutional amendments carry their chamber in 'Body'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            # Create the bill using its basic information
            bill = Bill(
                    session=session,
                    bill_id=bill_number,
                    title=info['Title'],
                    chamber=bill_chamber,
                    type=bill_type
                    )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, bill_number)
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                    '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                    'following-sibling::dd[1]/ul/li'
                    )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                # Everything after this marker item is a cosponsor
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()
                # Skip the five-character "Less" + one trailing character
                # entry (presumably the list's collapse link -- confirm
                # against the page).  The previous test compared a
                # five-character slice with the four-character "Less" and so
                # could never be true.
                is_less_link = (
                        sponsor_name[:4] == "Less" and len(sponsor_name) == 5)
                if sponsor_name and not is_less_link:
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            versions = doc.xpath(
                    '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                    'following-sibling::dd[1]/ul/li/a'
                    )
            for version in versions:
                bill.add_version(
                        name=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        mimetype='application/pdf'
                        )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                        r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(year_slug),
                        lxml.etree.tostring(doc)
                        ).group(1)
            except AttributeError:
                # re.search returned None: the page has no status link
                self.warning("Bill {} appears to have no activity".\
                        format(bill_number))
                self.save_bill(bill)
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            # Chamber codes ("H"/"S") that have passed the bill so far
            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}
                status = action['FullStatus']

                if "Signed by Governor" in status:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                # NOTE: the asserts below are sanity checks on the scraped
                # data; they are skipped when Python runs with -O.
                if "Signed by Governor" in status:
                    assert chambers_passed == set("HS")
                    action_type = 'governor:signed'
                elif actor == 'lower' and status in LOWER_PASSED_STATUSES:
                    action_type = 'bill:passed'
                    assert "H" not in chambers_passed
                    chambers_passed.add("H")
                elif actor == 'upper' and \
                        status.startswith(UPPER_PASSED_PREFIXES):
                    action_type = 'bill:passed'
                    assert "S" not in chambers_passed
                    chambers_passed.add("S")
                else:
                    action_type = 'other'

                bill.add_action(
                        actor=actor,
                        action=re.sub(HTML_TAGS_RE, "", status),
                        date=datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                        type=action_type
                        )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    # 'MemberName' is "<name> of <district>"; only the name is kept
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Yea/nay totals come from the status text; the "other" total
                # is the number of members who voted neither Yea nor Nay
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = Vote(
                        chamber=(
                                'lower' if vote['ChamberCode'] == 'H'
                                else 'upper'
                                ),
                        date=datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                        motion=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                        passed=did_pass,
                        yes_count=yea_count,
                        no_count=nay_count,
                        other_count=len(roll_call_other)
                        )
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                # A count mismatch is reported but does not drop the vote
                try:
                    vote_to_add.validate()
                except ValueError as e:
                    self.warning(e)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Ejemplo n.º 19
0
    def scrape_votes(self, bill, url):
        """Scrape the roll calls on an Oklahoma vote page and add them to ``bill``.

        Every ``<p>`` whose text matches "OKLAHOMA HOUSE" or "OKLAHOMA STATE
        SENATE" starts one roll call.  The motion, RCS number, date, tallies
        and voter names are read from that header's following sibling
        paragraphs.

        :param bill: bill object that receives each vote through ``add_vote``
        :param url: URL of the vote page; also recorded as each vote's source
        """
        page = lxml.html.fromstring(self.urlopen(url).replace(u'\xa0', ' '))

        re_ns = "http://exslt.org/regular-expressions"
        # Raw string: the backslashes are meant for the EXSLT regex engine.
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        # Compiled once; it is applied to every line of every roll call.
        tally_re = re.compile(
            r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING)\s*:\s*(\d+)')

        for header in page.xpath(path, namespaces={'re': re_ns}):
            # The motion sits at a different sibling offset in each chamber.
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 9

            motion = header.xpath(
                "string(following-sibling::p[%d])" % motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                # No explicit result; it is derived from the tallies below.
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            # The date is read from the element right after the RCS paragraph.
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            # Skips the first 13 following paragraphs (presumably header
            # boilerplate -- TODO confirm) and stops at the "*****" line.
            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break

                match = tally_re.match(line)
                if match:
                    if match.group(1) == 'YEAS':
                        vtype = 'yes'
                    elif match.group(1) == 'NAYS':
                        vtype = 'no'
                    elif match.group(1) == 'VACANT':
                        continue  # skip these
                    else:
                        vtype = 'other'
                    counts[vtype] += int(match.group(2))
                elif vtype is not None:
                    # Name lines belong to the most recent tally header.
                    # Lines seen before any header are ignored; they were
                    # previously collected under a None key nobody read.
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE BILL' in name or 'SENATE BILL' in name:
                            continue
                        votes[vtype].append(name.strip())

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            if not motion:
                motion = 'Senate Vote' if chamber == 'upper' else 'House Vote'

            vote = Vote(chamber, date, motion, passed,
                        counts['yes'], counts['no'], counts['other'],
                        rcs_num=rcs)
            # NOTE(review): validate() runs before the voter names are
            # attached and is not repeated afterwards -- confirm that a
            # post-population check is not wanted here.
            vote.validate()

            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                vote.no(name)
            for name in votes['other']:
                vote.other(name)

            bill.add_vote(vote)
Ejemplo n.º 20
0
    def scrape_votes(self, bill, link):
        """Parse all roll calls on a vote page and add them to ``bill``.

        The ``lblVoteData`` span's text is cut into per-vote chunks, and each
        chunk is cut on ten consecutive non-breaking spaces.  The first piece
        of a chunk is the motion; the other pieces are either tally lines
        ending in a count or lists of voter names.

        :param bill: bill object that receives each vote through ``add_vote``
        :param link: URL of the vote page; also recorded as each vote's source
        :returns: the ``bill`` object that was passed in
        """
        # Loop-invariant patterns, compiled once.  Raw strings keep ``\d``,
        # ``\w`` and ``\s`` from being read as (invalid) string escapes.
        chunk_regex = re.compile(r'\w+? by [\w ]+?\s+-')
        date_regex = re.compile(r'(\d+/\d+/\d+)')
        vote_regex = re.compile(r'\d+$')
        aye_regex = re.compile(r'^.+voting aye were: (.+) -')
        no_regex = re.compile(r'^.+voting no were: (.+) -')
        other_regex = re.compile(r'^.+present and not voting were: (.+) -')

        # The URL alone tells which chamber voted.
        if 'ChamberVoting=H' in link:
            chamber = 'lower'
        else:
            chamber = 'upper'

        page = self.urlopen(link)
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath(
            "//span[@id='lblVoteData']")[0].text_content()
        # Text ahead of the first "<word> by <name> -" header is discarded.
        raw_vote_data = chunk_regex.split(raw_vote_data.strip())[1:]
        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(
                u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            # Left as None when the motion contains no m/d/Y date.
            vote_date = None
            date_match = date_regex.search(motion)
            if date_match:
                vote_date = datetime.datetime.strptime(date_match.group(),
                                                       '%m/%d/%Y')

            # A chunk made of the motion alone has no raw_vote[1]; check the
            # length first so it does not raise IndexError.
            passed = ('Passed' in motion or 'Recommended for passage' in motion
                      or (len(raw_vote) > 1 and 'Adopted' in raw_vote[1]))

            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for v in raw_vote[1:]:
                v = v.strip()

                # A tally prefix only counts when the line ends in a number;
                # otherwise the line is still tried as a voter list.
                count_match = vote_regex.search(v)
                if count_match:
                    if v.startswith('Ayes...'):
                        yes_count = int(count_match.group())
                        continue
                    if v.startswith('Noes...'):
                        no_count = int(count_match.group())
                        continue
                    if v.startswith('Present and not voting...'):
                        other_count += int(count_match.group())
                        continue

                names_match = aye_regex.search(v)
                if names_match:
                    ayes = names_match.group(1).split(', ')
                    continue
                names_match = no_regex.search(v)
                if names_match:
                    nos = names_match.group(1).split(', ')
                    continue
                names_match = other_regex.search(v)
                if names_match:
                    # Extended rather than replaced, matching other_count.
                    others += names_match.group(1).split(', ')

            vote = Vote(chamber, vote_date, motion, passed, yes_count,
                        no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            for o in others:
                vote.other(o)

            vote.validate()
            bill.add_vote(vote)

        return bill
Ejemplo n.º 21
0
    def scrape_uppper_committee_vote(self, bill, date, url):
        """Parse a committee vote PDF and add the resulting vote to ``bill``.

        The PDF at ``url`` is downloaded and converted to text.  The motion
        is read from the "FINAL ACTION:" line, each member's vote from the
        column in which their vote code appears, and the yes/no totals from
        the "TOTALS" line.  The vote is always recorded for the 'upper'
        chamber.

        :param bill: bill object that receives the vote through ``add_vote``
        :param date: date to record on the vote
        :param url: URL of the vote PDF; also recorded as the vote's source
        :returns: None; returns early (with a warning) when the motion is empty
        :raises AssertionError: if a vote code sits between the Yea and Nay
            columns
        """
        (path, _resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Delete the downloaded file even when the conversion fails.
            os.remove(path)
        lines = text.split("\n")

        (_, motion) = lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.warning("Vote appears to be empty")
            return

        # First line holding the repeated "Yea  Nay" column headings.
        header_re = re.compile(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$')
        vote_top_row = [
            index for index, x in enumerate(lines) if header_re.search(x)][0]
        yea_columns_end = lines[vote_top_row].index("Yea") + len("Yea")
        nay_columns_begin = lines[vote_top_row].index("Nay")

        # Both patterns are applied to every member line; compile them once.
        member_re = re.compile(r'''(?x)
                        ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                        ([A-Z][a-z]+  # Name must have lower-case characters
                        [\w\-\s]+)  # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?  # Leadership has an all-caps title
                        (?:\s{2,}.*)?  # Name ends when many spaces are seen
                        ''')
        did_vote_re = re.compile(r'^\s+(X|VA|VC)\s+[A-Z][a-z]')

        votes = {'yes': [], 'no': [], 'other': []}
        for line in lines[(vote_top_row + 1):]:
            # End loop as soon as no more members are found
            if not line.strip():
                break

            member = member_re.search(line).group(1)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            if not did_vote_re.search(line):
                votes['other'].append(member)
                continue

            # Check where the "X" or vote code is on the page
            vote_column = len(line) - len(line.lstrip())
            if vote_column <= yea_columns_end:
                votes['yes'].append(member)
            elif vote_column >= nay_columns_begin:
                votes['no'].append(member)
            else:
                raise AssertionError(
                    "Unparseable vote found for {0} in {1}:\n{2}".
                    format(member, url, line))

        # Totals come from the document's TOTALS line, not from the tallied
        # names; the "other" count is the number of members without a vote.
        totals = re.search(
            r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS', text).groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        passed = (yes_count > no_count)
        other_count = len(votes['other'])

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)
        vote['yes_votes'] = votes['yes']
        vote['no_votes'] = votes['no']
        vote['other_votes'] = votes['other']

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 22
0
    def scrape_lower_committee_votes(self, session_number, bill):
        """
        Fetch every House committee vote for ``bill`` and add it to the bill.

        House committee roll calls are not available on the Senate's
        website, and the House addresses bills by an internal ID.  The bill
        is therefore located by posting a search form to the House site,
        following the single bill-detail link in the results, and then
        visiting each "See Votes" link on that page.
        """

        house_url = "http://www.myfloridahouse.gov/Sections/Bills/bills.aspx"

        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r"^\w+\s(\d+\w*)$", bill["bill_id"]).group(1)

        search_form = {
            "rblChamber": "B",
            "ddlSession": session_number,
            "ddlBillList": "-1",
            "txtBillNumber": bill_number,
            "ddlSponsor": "-1",
            "ddlReferredTo": "-1",
            "SubmittedByControl": "",
        }
        response = self.post(url=house_url, data=search_form)
        results_doc = lxml.html.fromstring(response.text)
        results_doc.make_links_absolute(house_url)

        # Exactly one bill-detail link is expected; unpacking fails otherwise
        (detail_url,) = results_doc.xpath(
            '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
        detail_doc = self.lxmlize(detail_url)

        for vote_url in detail_doc.xpath('//a[text()="See Votes"]/@href'):
            vote_doc = self.lxmlize(vote_url)

            (date_text,) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
            vote_date = datetime.datetime.strptime(
                date_text, "%m/%d/%Y %I:%M:%S %p").date()

            # The totals are read from the last nested table, with all
            # whitespace runs collapsed to single spaces
            totals_text = vote_doc.xpath("//table//table")[-1].text_content()
            totals_text = re.sub(r"(?mu)\s+", " ", totals_text).strip()
            totals_match = re.search(
                r"(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+"
                r"Total Missed:\s+(\d+)",
                totals_text)
            yes_count, no_count, other_count = (
                int(figure) for figure in totals_match.groups())
            passed = yes_count > no_count

            (committee,) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
            (action,) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
            motion = "{} ({})".format(action, committee)

            vote = Vote("lower", vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(vote_url)

            # Vote code -> method that records the member on the vote
            recorders = {"Y": vote.yes, "N": vote.no, "-": vote.other}

            for cell in vote_doc.xpath("//table//table//table//td"):
                if not cell.text_content().strip():
                    continue

                (member,) = cell.xpath("span[2]//text()")
                (code,) = cell.xpath("span[1]//text()")

                record = recorders.get(code)
                if record is not None:
                    record(member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif not re.search(r"\([YN]\)", code):
                    raise IndexError(
                        "Unknown vote type found: {}".format(code))

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 23
0
    def scrape(self, session, chambers):
        HTML_TAGS_RE = r'<.*?>'

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            # Create the bill using its basic information
            bill = Bill(session=session,
                        bill_id=info['BillNumber'],
                        title=info['Title'],
                        chamber=bill_chamber,
                        type=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[ :5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a')
            for version in versions:
                bill.add_version(name=version.xpath('text()')[0],
                                 url=version.xpath('@href')[0].replace(
                                     ' ', '%20'),
                                 mimetype='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                        year_slug), lxml.etree.tostring(doc)).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".\
                        format(info['BillNumber']))
                self.save_bill(bill)
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    assert chambers_passed == set("HS")
                    action_type = 'governor:signed'
                elif actor == 'lower' and \
                        any(x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("H")
                elif actor == 'upper' and \
                        any(x.lower().startswith(' aspassed') or x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("S")
                else:
                    action_type = 'other'

                bill.add_action(actor=actor,
                                action=re.sub(HTML_TAGS_RE, "",
                                              action['FullStatus']),
                                date=datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                type=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = Vote(chamber=('lower' if vote['ChamberCode']
                                            == 'H' else 'upper'),
                                   date=datetime.datetime.strptime(
                                       vote['StatusDate'], '%m/%d/%Y'),
                                   motion=re.sub(HTML_TAGS_RE, "",
                                                 vote['FullStatus']).strip(),
                                   passed=did_pass,
                                   yes_count=yea_count,
                                   no_count=nay_count,
                                   other_count=len(roll_call_other))
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                try:
                    vote_to_add.validate()
                except ValueError as e:
                    self.warning(e)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Ejemplo n.º 24
0
    def scrape_vote(self, bill, action_text, url):
        """Scrape a floor roll call page and attach the vote to ``bill``.

        ``action_text`` is the action line that pointed at the roll call; it
        might look like
        "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12".
        Chamber, motion, outcome and (when present) the yes/no totals come
        from that text.  The date, the detailed totals and the voter names
        come from the ``<nobr>`` text nodes of the page fetched from ``url``.

        Raises ``Exception`` if the chamber prefix or the motion outcome is
        not recognised; errors from ``vote.validate()`` propagate.
        """
        doc = lxml.html.fromstring(self.get(url).text)

        senate_prefix = 'Vote - Senate Floor - '
        house_prefix = 'Vote - House Floor - '
        if action_text.startswith(senate_prefix):
            action_text = action_text[len(senate_prefix):]
            chamber = 'upper'
        elif action_text.startswith(house_prefix):
            action_text = action_text[len(house_prefix):]
            chamber = 'lower'
        else:
            # Fail with a readable message instead of an unbound ``chamber``
            raise Exception('unknown vote chamber: %s' % action_text)

        motion, unused_date = action_text.rsplit(' - ', 1)
        totals = re.search(r'\((\d+)-(\d+)\)', motion)
        if totals:
            yes_count = int(totals.group(1))
            no_count = int(totals.group(2))
        else:
            self.info(
                "Motion text didn't contain vote totals, will get them from elsewhere"
            )
            yes_count = None
            no_count = None

        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Failed' in motion:
            motion = motion.split(' Failed')[0]
            passed = False
        elif 'Concur' in motion:
            passed = True
        elif 'Floor Amendment' in motion:
            # Compare against None: a one-sided tally such as "(46-0)" has a
            # zero count that must not be mistaken for missing totals.
            if yes_count is not None and no_count is not None:
                passed = yes_count > no_count
            else:
                passed = None
        elif 'overridden' in motion:
            passed = True
            motion = 'Veto Override'
        else:
            raise Exception('unknown motion: %s' % motion)

        vote = Vote(chamber=chamber,
                    date=None,
                    motion=motion,
                    yes_count=yes_count,
                    no_count=no_count,
                    other_count=None,
                    passed=passed)
        # Recorder (vote.yes / vote.no / vote.other) for the name lines that
        # follow the most recent section heading; None until one is seen.
        vfunc = None

        for text in doc.xpath('//nobr/text()'):
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                if vote['date']:
                    self.warning('two dates!, skipping rest of bill')
                    break
                stamp = text.split(': ', 1)[1]
                try:
                    # %p only influences the parsed hour together with %I
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %I:%M %p')
                except ValueError:
                    # Hour is not a valid 12-hour value; keep the old parse
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %H:%M %p')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                yeas, nays, nv, exc, absent = re.match(
                    r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent',
                    text).groups()
                vote['yes_count'] = int(yeas)
                vote['no_count'] = int(nays)
                vote['other_count'] = int(nv) + int(exc) + int(absent)
            elif 'Voting Yea' in text:
                vfunc = vote.yes
            elif 'Voting Nay' in text:
                vfunc = vote.no
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = vote.other
            elif vfunc:
                # A line may hold several names joined by " and "
                for leg in text.split(' and '):
                    # Strip the occasional asterisk - see #1512
                    vfunc(leg.rstrip('*'))

        vote.validate()
        vote.add_source(url)
        bill.add_vote(vote)
Ejemplo n.º 25
0
    def scrape_votes(self, bill, url):
        """Extract every roll call found at ``url`` and add it to ``bill``.

        The page is read as a run of ``<p>`` elements.  Each roll call opens
        with a paragraph matching "OKLAHOMA HOUSE" or "OKLAHOMA STATE
        SENATE"; the motion, the RCS number, the date and the tallies are
        taken from the paragraphs that follow it.  Once the YEAS tally has
        been seen, a tally line carrying extra text after its number marks
        the roll call as badly formatted and it is skipped.
        """
        html = self.get(url).text.replace(u'\xa0', ' ')
        page = lxml.html.fromstring(html)

        exslt = {'re': "http://exslt.org/regular-expressions"}
        header_path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        tally_re = re.compile(
            r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')

        for banner in page.xpath(header_path, namespaces=exslt):
            malformed = False

            # Each chamber has the motion name on a different line of the file
            if 'HOUSE' in banner.xpath("string()"):
                chamber, motion_index = 'lower', 8
            else:
                chamber, motion_index = 'upper', 13

            raw_motion = banner.xpath(
                "string(following-sibling::p[%d])" % motion_index)
            motion = re.sub(r'\s+', ' ', raw_motion.strip())
            assert motion.strip(), "Motion text not found"

            # A trailing PASSED/FAILED states the outcome outright
            passed = None
            outcome = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if outcome:
                motion, verdict = outcome.groups()
                passed = verdict == 'PASSED'

            rcs_para = banner.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_text = rcs_para.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_text).group(1)

            # The date sits in the paragraph right after the RCS one
            date_text = rcs_para.getnext().xpath("string()")
            date = datetime.datetime.strptime(
                re.search(r'\d+/\d+/\d+', date_text).group(0),
                "%m/%d/%Y").date()

            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)
            vtype = None
            seen_yes = False

            for para in banner.xpath("following-sibling::p")[13:]:
                line = para.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break

                tally = tally_re.match(line)
                if tally is None:
                    # Not a tally line: names, but only after YEAS was seen
                    if seen_yes:
                        for chunk in line.split('   '):
                            if not chunk:
                                continue
                            if 'HOUSE' in chunk or 'SENATE ' in chunk:
                                continue
                            votes[vtype].append(chunk.strip())
                    continue

                label, number, trailing = tally.groups()
                if label == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif label == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif label == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'

                if seen_yes and trailing.strip():
                    self.logger.warning("Bad vote format, skipping.")
                    malformed = True
                counts[vtype] += int(number)

            if malformed:
                continue

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber, date, motion, passed,
                        counts['yes'], counts['no'], counts['other'],
                        rcs_num=rcs)
            vote.validate()

            vote.add_source(url)

            for voter in votes['yes']:
                vote.yes(voter)
            for voter in votes['no']:
                # A colon means a tally line was mistaken for a name
                if ':' in voter:
                    raise Exception(voter)
                vote.no(voter)
            for voter in votes['other']:
                vote.other(voter)

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 26
0
    def scrape_floor_vote(self, chamber, bill, date, url):
        """Parse a floor roll call PDF and add the resulting vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and read by line
        position: the motion name (optionally continued on up to two more
        lines), a "Yeas - N  Nays - N  Not Voting - N" totals line, then
        rows of "[vote code] [member name]-[district number]" up to the
        first blank line.  The vote passes when yeas outnumber nays.

        If the recorded voters do not validate against the totals, members
        listed without a vote code are added as "other" votes and the vote
        is validated again; a failure of that second validation propagates.
        """
        (path, _resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Always delete the downloaded file, even if conversion fails
            os.remove(path)
        lines = text.split("\n")

        # Line positions; shifted below to follow the actual layout
        motion_index = 4
        totals_index = 6
        vote_start_index = 9

        motion = lines[motion_index].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if (not motion and
                not lines[motion_index - 1].startswith("Calendar Page:")):
            # Stripped like the primary motion line above
            motion = lines[motion_index - 1].strip()
            motion_index -= 1
            totals_index -= 1
            vote_start_index -= 1
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        # The motion may continue on up to two further non-blank lines, each
        # of which pushes the totals and the vote rows down by one line
        for _extra_motion_line in range(2):
            motion_index += 1
            if lines[motion_index].strip():
                motion = "{}, {}".format(motion, lines[motion_index].strip())
                totals_index += 1
                vote_start_index += 1
            else:
                break

        (yes_count, no_count, other_count) = [int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[totals_index]).groups()]
        passed = (yes_count > no_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[vote_start_index:]:
            if not line.strip():
                break

            # Drop the presiding officer's title so only the name is matched
            if " President " in line:
                line = line.replace(" President ", " ")
            elif " Speaker " in line:
                line = line.replace(" Speaker ", " ")

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
                vote.yes(member)
            for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
                vote.no(member)
            for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
                vote.other(member)

        try:
            vote.validate()
        except ValueError:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.logger.info("Votes don't add up; looking for additional ones")
            for line in lines[vote_start_index:]:
                if not line.strip():
                    break
                for member in re.findall(
                        r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}', line):
                    vote.other(member)

        vote.validate()
        bill.add_vote(vote)
Ejemplo n.º 27
0
    def scrape_lower_committee_votes(self, session_number, bill):
        """Fetch the House committee roll calls for ``bill`` and add them.

        House committee roll calls are not available on the Senate's
        website, and the House uses an internal ID system in its URLs.  The
        bill is therefore located by posting its number to the House bill
        search form; each "See Votes" link on the resulting bill page is
        then scraped into one vote that is validated and added to ``bill``.

        Raises ``IndexError`` when a member's vote marker is not recognised.
        """
        house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'
        digits_only = ''.join(ch for ch in bill['bill_id'] if ch.isdigit())
        search_form = {
            'rblChamber': 'B',
            'ddlSession': session_number,
            'ddlBillList': '-1',
            'txtBillNumber': digits_only,
            'ddlSponsor': '-1',
            'ddlReferredTo': '-1',
            'SubmittedByControl': '',
        }
        search_page = self.post(url=house_url, data=search_form).text
        doc = lxml.html.fromstring(search_page)
        doc.make_links_absolute(house_url)

        # Exactly one matching bill link is expected; unpacking enforces it
        [bill_link] = doc.xpath(
            '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
        bill_doc = self.lxmlize(bill_link)

        for link in bill_doc.xpath('//a[text()="See Votes"]/@href'):
            vote_doc = self.lxmlize(link)

            [date_text] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
            date = datetime.datetime.strptime(
                date_text, '%m/%d/%Y %I:%M:%S %p').date()

            # The totals live in the last nested table; collapse whitespace
            # so the pattern below can match across the original line breaks
            totals_cell = vote_doc.xpath('//table//table')[-1].text_content()
            totals = re.sub(r'(?mu)\s+', " ", totals_cell).strip()
            tally = re.search(
                r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+Total Missed:\s+(\d+)',
                totals)
            yes_count, no_count, other_count = [
                int(number) for number in tally.groups()]
            passed = yes_count > no_count

            [committee] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
            [action] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
            motion = "{} ({})".format(action, committee)

            vote = Vote('lower', date, motion, passed, yes_count, no_count,
                        other_count)
            vote.add_source(link)

            recorders = {"Y": vote.yes, "N": vote.no, "-": vote.other}
            for cell in vote_doc.xpath('//table//table//table//td'):
                if not cell.text_content().strip():
                    continue

                [member] = cell.xpath('span[2]//text()')
                [marker] = cell.xpath('span[1]//text()')

                record = recorders.get(marker)
                if record is not None:
                    record(member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif re.search(r'\([YN]\)', marker):
                    continue
                else:
                    raise IndexError("Unknown vote type found: {}".format(
                        marker))

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 28
0
    def scrape_votes(self, bill, url):
        """Extract every roll call found at ``url`` and add it to ``bill``.

        The page is read as a run of ``<p>`` elements.  Each roll call opens
        with a paragraph matching "OKLAHOMA HOUSE" or "OKLAHOMA STATE
        SENATE"; the motion, the RCS number, the date and the tallies are
        taken from the paragraphs that follow it.  When no motion text is
        found, "Senate Vote" or "House Vote" is used instead.
        """
        html = self.urlopen(url).replace(u"\xa0", " ")
        page = lxml.html.fromstring(html)

        exslt = {"re": "http://exslt.org/regular-expressions"}
        header_path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        tally_re = re.compile(
            r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)")

        for banner in page.xpath(header_path, namespaces=exslt):
            # The motion sits at a different offset for each chamber
            if "HOUSE" in banner.xpath("string()"):
                chamber, motion_index = "lower", 8
            else:
                chamber, motion_index = "upper", 9

            raw_motion = banner.xpath(
                "string(following-sibling::p[%d])" % motion_index)
            motion = re.sub(r"\s+", " ", raw_motion.strip())

            # A trailing PASSED/FAILED states the outcome outright
            passed = None
            outcome = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if outcome:
                motion, verdict = outcome.groups()
                passed = verdict == "PASSED"

            rcs_para = banner.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_text = rcs_para.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_text).group(1)

            # The date sits in the paragraph right after the RCS one
            date_text = rcs_para.getnext().xpath("string()")
            date = datetime.datetime.strptime(
                re.search(r"\d+/\d+/\d+", date_text).group(0),
                "%m/%d/%Y").date()

            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)
            vtype = None
            seen_yes = False

            for para in banner.xpath("following-sibling::p")[13:]:
                line = para.xpath("string()").replace("\r\n", " ").strip()
                if "*****" in line:
                    break

                tally = tally_re.match(line)
                if tally is None:
                    # Not a tally line: names, but only after YEAS was seen
                    if seen_yes:
                        for chunk in line.split("   "):
                            if not chunk:
                                continue
                            if "HOUSE BILL" in chunk or "SENATE BILL" in chunk:
                                continue
                            votes[vtype].append(chunk.strip())
                    continue

                label, number = tally.groups()
                if label == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif label == "NAYS" and seen_yes:
                    vtype = "no"
                elif label == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                counts[vtype] += int(number)

            if passed is None:
                passed = counts["yes"] > (counts["no"] + counts["other"])

            if not motion:
                motion = "Senate Vote" if chamber == "upper" else "House Vote"

            vote = Vote(chamber, date, motion, passed,
                        counts["yes"], counts["no"], counts["other"],
                        rcs_num=rcs)
            vote.validate()

            vote.add_source(url)

            for voter in votes["yes"]:
                vote.yes(voter)
            for voter in votes["no"]:
                # A colon means a tally line was mistaken for a name
                if ":" in voter:
                    raise Exception(voter)
                vote.no(voter)
            for voter in votes["other"]:
                vote.other(voter)

            vote.validate()
            bill.add_vote(vote)
Ejemplo n.º 29
0
    def scrape_vote(self, bill, action_text, url):
        """Scrape a floor roll call page and attach the vote to ``bill``.

        ``action_text`` is the action line that pointed at the roll call; it
        might look like
        "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12".
        Chamber, motion, outcome and (when present) the yes/no totals come
        from that text.  The date, the detailed totals and the voter names
        come from the ``<nobr>`` text nodes of the page fetched from ``url``.

        Raises ``Exception`` if the chamber prefix or the motion outcome is
        not recognised; errors from ``vote.validate()`` propagate.
        """
        doc = lxml.html.fromstring(self.get(url).text)

        senate_prefix = 'Vote - Senate Floor - '
        house_prefix = 'Vote - House Floor - '
        if action_text.startswith(senate_prefix):
            action_text = action_text[len(senate_prefix):]
            chamber = 'upper'
        elif action_text.startswith(house_prefix):
            action_text = action_text[len(house_prefix):]
            chamber = 'lower'
        else:
            # Fail with a readable message instead of an unbound ``chamber``
            raise Exception('unknown vote chamber: %s' % action_text)

        motion, unused_date = action_text.rsplit(' - ', 1)
        totals = re.search(r'\((\d+)-(\d+)\)', motion)
        if totals:
            yes_count = int(totals.group(1))
            no_count = int(totals.group(2))
        else:
            self.info("Motion text didn't contain vote totals, will get them from elsewhere")
            yes_count = None
            no_count = None

        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Failed' in motion:
            motion = motion.split(' Failed')[0]
            passed = False
        elif 'Concur' in motion:
            passed = True
        elif 'Floor Amendment' in motion:
            # Compare against None: a one-sided tally such as "(46-0)" has a
            # zero count that must not be mistaken for missing totals.
            if yes_count is not None and no_count is not None:
                passed = yes_count > no_count
            else:
                passed = None
        elif 'overridden' in motion:
            passed = True
            motion = 'Veto Override'
        else:
            raise Exception('unknown motion: %s' % motion)

        vote = Vote(chamber=chamber, date=None, motion=motion,
                    yes_count=yes_count, no_count=no_count,
                    other_count=None, passed=passed)
        # Recorder (vote.yes / vote.no / vote.other) for the name lines that
        # follow the most recent section heading; None until one is seen.
        vfunc = None

        for text in doc.xpath('//nobr/text()'):
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                if vote['date']:
                    self.warning('two dates!, skipping rest of bill')
                    break
                stamp = text.split(': ', 1)[1]
                try:
                    # %p only influences the parsed hour together with %I
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %I:%M %p')
                except ValueError:
                    # Hour is not a valid 12-hour value; keep the old parse
                    vote['date'] = datetime.datetime.strptime(
                        stamp, '%b %d, %Y %H:%M %p')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                yeas, nays, nv, exc, absent = re.match(
                    r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent',
                    text).groups()
                vote['yes_count'] = int(yeas)
                vote['no_count'] = int(nays)
                vote['other_count'] = int(nv) + int(exc) + int(absent)
            elif 'Voting Yea' in text:
                vfunc = vote.yes
            elif 'Voting Nay' in text:
                vfunc = vote.no
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = vote.other
            elif vfunc:
                # A line may hold any number of names joined by " and ";
                # unpacking into exactly two would fail on three or more.
                for leg in text.split(' and '):
                    # Names occasionally carry a trailing asterisk
                    vfunc(leg.rstrip('*'))

        vote.validate()
        vote.add_source(url)
        bill.add_vote(vote)