Example #1
0
    def scrape_bill_details(self, url, bill):
        """Fill in a bill's summary, versions, actions and votes from its detail page.

        ``url`` is fetched with ``self.get`` and parsed with lxml; ``bill`` is
        updated in place and nothing is returned.
        """
        html = self.get(url, retry_on_404=True).text
        doc = lxml.html.fromstring(html)

        # summary sections
        summary = doc.xpath('//h4[starts-with(text(), "SUMMARY")]/following-sibling::p/text()')
        if summary and summary[0].strip():
            bill['summary'] = summary[0].strip()

        # versions
        for va in doc.xpath('//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):

            # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
            # split on the first separator only, so a description that happens
            # to contain the separator again still unpacks into two parts
            date, desc = va.text.split(u' \xa0', 1)
            # chop off last part (str.rsplit returns a new value, so the
            # result has to be assigned back for the chop to take effect)
            desc = desc.rsplit(' ', 1)[0]
            link = va.get('href')
            date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

            # budget bills in VA are searchable but no full text available
            if '+men+' in link:
                self.warning('not adding budget version, bill text not available')
            else:
                # VA duplicates reprinted bills, lets keep the original name
                bill.add_version(desc, BASE_URL+link, date=date,
                                 mimetype='text/html',
                                 on_duplicate='use_old')

        # actions
        for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
            date, action = ali.text_content().split(u' \xa0', 1)
            actor, action = action.split(': ', 1)

            actor = self.actor_map[actor]
            date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

            # if action ends in (##-Y ##-N) remove that part
            vrematch = self.vote_strip_re.match(action)
            if vrematch:
                # the fourth group is not used; other_count is derived below
                action, y, n, _unused = vrematch.groups()
                vote = Vote(actor, date, action, int(y) > int(n),
                            int(y), int(n), 0)
                vote_url = ali.xpath('a/@href')
                if vote_url:
                    self.parse_vote(vote, vote_url[0])
                    vote.add_source(BASE_URL + vote_url[0])
                # set other count, it isn't provided
                vote['other_count'] = len(vote['other_votes'])
                bill.add_vote(vote)

            # categorize actions; the for/else falls back to 'other' when no
            # classifier pattern matches
            for pattern, atype in self._action_classifiers:
                if re.match(pattern, action):
                    break
            else:
                atype = 'other'

            # if matched a 'None' atype, don't add the action
            if atype:
                bill.add_action(actor, action, date, type=atype)
Example #2
0
    def scrape_vote(self, bill, date, motion, url):
        """Read one roll-call page and attach the resulting vote to ``bill``.

        Returns without adding anything when the page says the vote is
        "not yet official".
        """
        raw = self.urlopen(url)

        # Sometimes they link to vote pages before they go live
        if 'not yet official' in raw:
            return

        doc = lxml.html.fromstring(raw)

        actor = 'upper' if url.endswith('Senate') else 'lower'

        count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

        def tally(label):
            # the number is the last whitespace-separated token of the cell
            cell_text = doc.xpath(count_path % label)
            return int(cell_text.split()[-1])

        yes_count = tally("Yeas")
        no_count = tally("Nays")
        other_count = tally("Non Voting")
        other_count += tally("Present")

        # passing requires the yeas to outnumber everything else combined
        passed = yes_count > no_count + other_count
        vote = Vote(actor, date, motion, passed, yes_count,
                    no_count, other_count)
        vote.add_source(url)

        tables = doc.xpath(
            '//*[contains(@class, "ms-standardheader")]/'
            'following-sibling::table')
        # tables are paired positionally with these recorder method names
        recorder_names = ['yes', 'no', 'other', 'other']
        for recorder_name, table in zip(recorder_names, tables):
            record = getattr
            for anchor in table.xpath('.//a'):
                record(vote, recorder_name)(anchor.text_content())
        bill.add_vote(vote)
Example #3
0
    def add_vote(self, bill, chamber, date, line, text):
        """Create a vote from an action's text and attach it to ``bill``.

        ``text`` must contain an "Ayes N, Noes M" tally (an IndexError is
        raised otherwise).  ``line`` is the lxml element for the action; if
        it contains a link to a vote page, that page is downloaded and passed
        to the chamber-specific roll-call parser.
        """
        votes = re.findall(r'Ayes (\d+)\, Noes (\d+)', text)
        (yes, no) = int(votes[0][0]), int(votes[0][1])

        vtype = 'other'
        for regex, motion_type in motion_classifiers.iteritems():
            if re.match(regex, text):
                vtype = motion_type
                break

        v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

        # fetch the vote itself
        # './/' keeps the search inside `line`; a bare '//' is evaluated from
        # the document root and would return the first vote link on the whole
        # page for every action.
        # NOTE(review): assumes the link is a descendant of `line` - confirm
        link = line.xpath('.//a[contains(@href, "/votes/")]')
        if link:
            link = link[0].get('href')
            v.add_source(link)

            filename, _resp = self.urlretrieve(link)

            if 'av' in link:
                self.add_house_votes(v, filename)
            elif 'sv' in link:
                self.add_senate_votes(v, filename)

        bill.add_vote(v)
Example #4
0
    def scrape_votes(self, link, chamber, bill):
        """Parse the votes table at ``link`` and add each vote to ``bill``.

        Cells of the first table on the page are consumed five at a time
        after the first three cells are dropped.  Rows whose text mentions
        "cow"/"COW" are skipped.
        """
        prefix = "FINAL VOTE - "
        with self.urlopen(link) as votes_page_html:
            votes_page = lxml.html.fromstring(votes_page_html)
            votes_table = votes_page.cssselect("table")[0]
            # Eliminate table headings and unnecessary element
            votes_elements = votes_table.cssselect("td")[3:]
            for actor, date, name_and_text, name, text in grouper(5, votes_elements):
                description = text.text_content()
                if "cow" in description or "COW" in description:
                    continue
                vote_date = dt.datetime.strptime(date.text_content(), "%m/%d/%Y")

                # Remove the literal prefix.  str.lstrip() takes a *set of
                # characters*, so lstrip("FINAL VOTE - ") also ate leading
                # letters of the motion itself (e.g. the "T" of "Third").
                motion_and_votes = description.lstrip()
                if motion_and_votes.startswith(prefix):
                    motion_and_votes = motion_and_votes[len(prefix):]

                motion, sep, votes = motion_and_votes.partition(".")
                passed = "passed" in votes

                # yes-no, optionally followed by -other
                votes_match = re.search("([0-9]+)-([0-9]+)-?([0-9]+)?", votes)
                yes_count = int(votes_match.group(1))
                no_count = int(votes_match.group(2))
                other_group = votes_match.group(3)
                other_count = int(other_group) if other_group is not None else 0

                vote = Vote(chamber, vote_date, motion, passed, yes_count, no_count, other_count)
                vote.add_source(link)
                bill.add_vote(vote)
Example #5
0
    def parse_vote(self, bill, action, act_chamber, act_date, url,
        re_vote_text = re.compile(r'The question (?:being|to be reconsidered):\s*"(.*?\?)"', re.S),
        re_header=re.compile(r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
        """Parse every voting report on a journal page and add the votes to ``bill``.

        The page at ``url`` is fetched and the text of its second ``<pre>``
        element is split on "The question being: ..." sentences; each
        question becomes the motion of one vote.  Returns without adding
        anything when the page has fewer than two ``<pre>`` elements.

        ``re_vote_text`` and ``re_header`` are pre-compiled patterns supplied
        as defaults so they are built only once.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Find all chunks of text representing voting reports.
        votes_text_container = doc.xpath('//pre')
        if len(votes_text_container) < 2:
            return
        votes_text = votes_text_container[1].text_content()
        # split() with one capturing group yields
        # [preamble, motion, text, motion, text, ...]
        votes_text = re_vote_text.split(votes_text)
        votes_data = zip(votes_text[1::2], votes_text[2::2])

        # Process each.
        for motion, text in votes_data:

            yes = no = other = 0

            tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
            for vtype, vcount in tally:
                # the pattern captures an empty string when no digits follow
                # the label (for instance a "-" placeholder); count that as 0
                # rather than letting int('') raise
                vcount = int(vcount) if vcount else 0
                if vtype == 'Y':
                    yes = vcount
                elif vtype == 'N':
                    no = vcount
                else:
                    other += vcount

            vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

            # In lengthy documents, the "header" can be repeated in the middle
            # of content. This regex gets rid of it.
            vote_lines = re_header.sub('', text)
            vote_lines = vote_lines.split('\r\n')

            # Each label starts a list of names; the list continues on the
            # following lines until a blank line.  Slicing by len(prefix)
            # keeps the offsets right ('Absent: ' is 8 characters, not 9).
            recorders = (
                ('Yeas: ', vote.yes),
                ('Nays: ', vote.no),
                ('Excused: ', vote.other),
                ('Absent: ', vote.other),
            )

            vote_type = None
            for vote_list in vote_lines:
                for prefix, recorder in recorders:
                    if vote_list.startswith(prefix):
                        vote_list, vote_type = vote_list[len(prefix):], recorder
                        break
                else:
                    if vote_list.strip() == '':
                        vote_type = None
                if vote_type:
                    for name in vote_list.split(','):
                        name = name.strip()
                        if name:
                            vote_type(name)

            vote.add_source(url)
            bill.add_vote(vote)
Example #6
0
    def add_vote(self, bill, chamber, date, text, url):
        """Build a vote from the tally in ``text`` and attach it to ``bill``.

        When ``url`` is given it is recorded as a source and handed to the
        house or senate roll-call parser, depending on whether it contains
        'av' or 'sv'.
        """
        tallies = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
        first_tally = tallies[0]
        yes = int(first_tally[0])
        no = int(first_tally[1])

        # first classifier whose pattern matches wins; otherwise 'other'
        vtype = next(
            (kind for pattern, kind in motion_classifiers.iteritems()
             if re.match(pattern, text)),
            'other')

        v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

        # fetch the vote itself
        if url:
            v.add_source(url)

            parser = None
            if 'av' in url:
                parser = self.add_house_votes
            elif 'sv' in url:
                parser = self.add_senate_votes
            if parser is not None:
                parser(v, url)

        # other count is brute forced
        v['other_count'] = len(v['other_votes'])
        v.validate()
        bill.add_vote(v)
Example #7
0
    def vote(self):
        """Assemble and return the billy vote for this roll call.
        """
        recorded = collections.defaultdict(list)
        when = self.date()
        motion_text = self.motion()
        outcome = self.passed()
        tallies = self.get_counts()

        yeas = int(tallies.get("Yeas", 0))
        nays = int(tallies.get("Nays", 0))
        # every tally that is neither a yea nor a nay counts as "other"
        everything = sum(int(value) for value in tallies.values())
        others = everything - (yeas + nays)

        result = Vote(self.chamber, when, motion_text, outcome,
                      yeas, nays, others,
                      actual_vote=dict(recorded))

        # vote_val names the recorder method to call on the vote
        for vote_val, voter in self.vote_values():
            record = getattr(result, vote_val)
            record(voter)

        result.add_source(self.url)
        return result
Example #8
0
    def parse_senate_vote(self, sv_text, url):
        """Create the senate vote from ``sv_text`` and sanity-check its totals.

        Raises ValueError when the totals row disagrees with the parsed
        counts or when the parsed date is not a datetime.
        """
        # Add new columns as they appear to be safe
        overrides = {"ONEILL": "O'NEILL"}
        vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
        vote.add_source(url)
        vote, rowHeads, saneRow = self.parse_visual_grid(vote, sv_text, overrides, sVoteHeader, rDate, 'TOTAL', 'TOTAL')

        # Totals as listed on the page, to compare with what was parsed
        listed = {'yes': 0, 'no': 0, 'other': 0}
        # Header cells sorted by key so they line up with the totals row
        ordered_heads = sorted(rowHeads.items(), key=operator.itemgetter(0))

        # Stays None until the 'TOTAL' label cell is seen; afterwards it
        # indexes the header that belongs to the current totals cell.
        position = None
        for cell in saneRow:
            if position is None:
                if 'TOTAL' in cell[0]:
                    position = 0
                continue

            heading = ordered_heads[position][1]
            initial = heading[0]
            if initial == 'Y':
                listed['yes'] = int(cell[0])
            elif initial == 'N':
                listed['no'] = int(cell[0])
            else:
                listed['other'] += int(cell[0])
            position += 1

        # The parsed vote totals must match the counts in the totals row
        totals_agree = (listed['yes'] == vote['yes_count'] and
                        listed['no'] == vote['no_count'] and
                        listed['other'] == vote['other_count'])
        if not totals_agree:
            raise ValueError("Votes were not parsed correctly")

        # The date placeholder must have been replaced with a real date
        if not isinstance(vote['date'], datetime):
            raise ValueError("Date was not parsed correctly")

        return vote
Example #9
0
    def scrape(self, chamber, session):
        """Save one hard-coded example bill for ``chamber`` in ``session``."""
        self.validate_session(session)

        if chamber == 'upper':
            bill_id, other_chamber = 'SB 1', 'lower'
        else:
            bill_id, other_chamber = 'HB 1', 'upper'

        bill = Bill(session, chamber, bill_id, 'A super bill')
        bill.add_source('http://example.com/')
        bill.add_version('As Introduced', 'http://example.com/SB1.html')
        bill.add_document('Google', 'http://google.com')
        for role, sponsor in (('primary', 'Bob Smith'),
                              ('secondary', 'Johnson, Sally')):
            bill.add_sponsor(role, sponsor)

        date_format = '%m/%d/%Y'

        # unanimous upper-chamber vote
        first_day = datetime.datetime.strptime('1/29/2010', date_format)
        upper_vote = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
        for member in ('Smith', 'Johnson'):
            upper_vote.yes(member)

        # failed lower-chamber vote
        second_day = datetime.datetime.strptime('1/30/2010', date_format)
        lower_vote = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
        lower_vote.no('Bob Smith')
        lower_vote.other('S. Johnson')

        for recorded in (upper_vote, lower_vote):
            bill.add_vote(recorded)

        actions = (
            (chamber, 'introduced', first_day),
            (chamber, 'read first time', second_day),
            (other_chamber, 'introduced', second_day),
        )
        for actor, description, when in actions:
            bill.add_action(actor, description, when)

        self.save_bill(bill)
Example #10
0
    def scrape_senate_vote(self, bill, url):
        """Parse a senate roll-call PDF and add the vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and removed
        again.  The method logs a message and returns without adding a vote
        when the date, time, vote counts or motion cannot be found.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, "text")
        finally:
            # remove the download even when the conversion fails
            os.remove(path)

        lines = text.split("\n")

        date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
        if not date_match:
            self.log("Couldn't find date on %s" % url)
            return

        time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
        if not time_match:
            # previously this fell through to an AttributeError on None
            self.log("Couldn't find time on %s" % url)
            return

        date = "%s %s %s" % (date_match.group(1), time_match.group(1), time_match.group(2))
        date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
        date = self._tz.localize(date)

        vote_type = None
        yes_count, no_count, other_count = None, None, 0
        votes = []
        # the first 21 lines are skipped - presumably the PDF header; TODO confirm
        for line in lines[21:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith("YEAS"):
                yes_count = int(line.split(" - ")[1])
                vote_type = "yes"
            elif line.startswith("NAYS"):
                no_count = int(line.split(" - ")[1])
                vote_type = "no"
            elif line.startswith("EXCUSED") or line.startswith("NOT VOTING"):
                other_count += int(line.split(" - ")[1])
                vote_type = "other"
            else:
                # names are separated by runs of two or more spaces and
                # belong to the most recent heading
                votes.extend([(n.strip(), vote_type) for n in re.split(r"\s{2,}", line)])

        if yes_count is None or no_count is None:
            self.log("Couldne't find vote counts in %s" % url)
            return

        passed = yes_count > no_count + other_count

        # the motion sits two lines below the line holding the bill id; when
        # the id occurs more than once the last occurrence wins
        clean_bill_id = fix_bill_id(bill["bill_id"])
        motion_line = None
        for i, line in enumerate(lines):
            if line.strip() == clean_bill_id:
                motion_line = i + 2

        # guard against "id never found" (None) and "id too close to the end"
        motion = None
        if motion_line is not None and motion_line < len(lines):
            motion = lines[motion_line]
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote("upper", date, motion, passed, yes_count, no_count, other_count)
        vote.add_source(url)

        insert_specific_votes(vote, votes)
        check_vote_counts(vote)

        bill.add_vote(vote)
Example #11
0
    def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
        """Read a committee vote from the first table of ``page`` and add it to ``bill``.

        Voter names are the text that follows each ``<br>`` in the yes / no /
        other cells; the counts are the number of names found.
        """
        table = page.xpath("//table")[0]
        row = table.xpath(".//tr")[0]
        if row[0].text_content() == 'Votes:':
            # New website layout: the names are in the third row
            row = table.xpath(".//tr")[2]

        cells = row.xpath(".//td")
        if len(cells) >= 3:
            yes, no, other = cells[:3]
        else:
            # only a "yes" cell is present
            yes, no, other = cells[0], None, None

        def collect(cell, label):
            # A missing cell yields an empty, untyped block
            if cell is None:
                return {"type": None, "count": None, "votes": []}
            tails = (br.tail for br in cell.xpath(".//br"))
            names = [tail.strip() for tail in tails if tail and tail.strip()]
            return {"type": label, "count": len(names), "votes": names}

        vote_dict = {
            "yes": collect(yes, 'yes'),
            "no": collect(no, 'no'),
            "other": collect(other, 'other'),
        }

        yes_count = vote_dict['yes']['count']
        # counts are None for missing cells; treat those as zero
        no_count = vote_dict['no']['count'] or 0
        other_count = vote_dict['other']['count'] or 0
        passed = yes_count > no_count

        vote = Vote(actor, date, motion, passed,
                    yes_count, no_count, other_count,
                    _vote_id=uniqid)
        vote.add_source(url)

        # the dict key doubles as the name of the recorder method on the vote
        for key, block in vote_dict.items():
            for voter in block['votes']:
                getattr(vote, key)(voter)

        bill.add_vote(vote)
Example #12
0
    def scrape_vote(self, bill, action_text, url):
        """Build a floor vote from an action string plus its roll-call page.

        ``action_text`` looks like
        "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12".
        The chamber, motion, result and preliminary counts come from that
        string; the date, final counts and individual votes come from the
        page at ``url``.  The validated vote is added to ``bill``.

        Raises Exception when the chamber prefix or the motion outcome is
        not recognised.
        """
        doc = lxml.html.fromstring(self.urlopen(url))

        senate_prefix = 'Vote - Senate Floor - '
        house_prefix = 'Vote - House Floor - '
        if action_text.startswith(senate_prefix):
            action_text = action_text[len(senate_prefix):]
            chamber = 'upper'
        elif action_text.startswith(house_prefix):
            action_text = action_text[len(house_prefix):]
            chamber = 'lower'
        else:
            # previously `chamber` was left unbound and the Vote() call
            # below failed with an UnboundLocalError
            raise Exception('unknown chamber: %s' % action_text)

        # the date is the last ' - ' separated field; splitting from the
        # right keeps motions that contain ' - ' themselves intact
        motion, unused_date = action_text.rsplit(' - ', 1)
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Floor Amendment' in motion:
            passed = int(yes_count) > int(no_count)
        else:
            raise Exception('unknown motion: %s' % motion)

        vote = Vote(chamber=chamber, date=None, motion=motion,
                    yes_count=int(yes_count), no_count=int(no_count),
                    other_count=0, passed=passed)
        # recorder for the section of names currently being read
        vfunc = None

        nobrs = doc.xpath('//nobr/text()')
        for text in nobrs:
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                stamp = text.split(': ', 1)[1]
                # %p only affects the hour when it is parsed with %I; with %H
                # the AM/PM marker is ignored and afternoon times come out 12
                # hours early.  Fall back to the old format for stamps that
                # are not valid 12-hour times.
                try:
                    vote['date'] = datetime.datetime.strptime(stamp, '%b %d, %Y %I:%M %p')
                except ValueError:
                    vote['date'] = datetime.datetime.strptime(stamp, '%b %d, %Y %H:%M %p')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                self.debug(text)
                yeas, nays, nv, exc, absent = re.match(r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent', text).groups()
                # the page totals replace the counts taken from the action text
                vote['yes_count'] = int(yeas)
                vote['no_count'] = int(nays)
                vote['other_count'] = int(nv) + int(exc) + int(absent)
            elif 'Voting Yea' in text:
                vfunc = vote.yes
            elif 'Voting Nay' in text:
                vfunc = vote.no
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = vote.other
            elif vfunc:
                vfunc(text)

        vote.validate()
        vote.add_source(url)
        bill.add_vote(vote)
Example #13
0
    def scrape_votes(self, session):
        """Attach roll-call votes from the zipped data files to known bills.

        ``tblrollcallsummary.txt`` supplies one pipe-delimited record per
        vote (counts, timestamp, motion); ``tblrollcallhistory.txt`` supplies
        one record per legislator per vote.  Only records for ``session``
        whose bill id is in ``self.bills_by_id`` are used.
        """
        votes = {}
        last_line = []

        for line in self.zf.open('tblrollcallsummary.txt'):
            if line.strip() == "":
                continue

            line = line.split('|')
            if len(line) < 14:
                # A record broken across two physical lines: the first field
                # of the continuation is the tail of the broken field, so it
                # is not counted again.
                if len(last_line + line[1:]) == 14:
                    # use the merged record (this used to assign only the
                    # first fragment, which is still too short to parse)
                    line = last_line + line[1:]
                    last_line = []
                    self.warning('used bad vote line')
                else:
                    last_line = line
                    self.warning('bad vote line %s' % '|'.join(line))
                    # an incomplete fragment cannot be parsed on its own
                    continue
            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            present = int(line[7])
            absent = int(line[8])
            motion = line[11].strip()

            if session_yr == session and bill_id in self.bills_by_id:
                actor = 'lower' if body == 'H' else 'upper'
                # %p only affects the hour when it is parsed with %I; with %H
                # the AM/PM marker is ignored.  Fall back to the old format
                # for stamps that are not valid 12-hour times.
                try:
                    when = datetime.datetime.strptime(timestamp,
                                                      '%m/%d/%Y %I:%M:%S %p')
                except ValueError:
                    when = datetime.datetime.strptime(timestamp,
                                                      '%m/%d/%Y %H:%M:%S %p')
                # TODO: stop faking passed somehow
                passed = yeas > nays
                # other_count starts at 0 and is incremented per legislator
                # in the second pass below
                vote = Vote(actor, when, motion, passed, yeas, nays,
                            other_count=0)
                votes[body+vote_num] = vote
                self.bills_by_id[bill_id].add_vote(vote)

        for line in self.zf.open('tblrollcallhistory.txt'):
            session_yr, body, v_num, employee, bill_id, choice = line.split('|')

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                leg = self.legislators[employee]['name']
                choice = choice.strip()
                vote_key = body+v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % ( body+v_num ) )
                    continue

                if choice == 'Yea':
                    votes[vote_key].yes(leg)
                elif choice == 'Nay':
                    votes[vote_key].no(leg)
                else:
                    votes[vote_key].other(leg)
                    votes[vote_key]['other_count'] += 1
Example #14
0
    def scrape_current(self, chamber, term):
        """Scrape every bill with history in ``chamber`` from the bill_status API.

        Bills whose history has no event in this chamber are skipped.  Each
        remaining bill gets its titles, document, version, sponsors, actions
        and (for voted action codes) votes, and is then saved.
        """
        chamber_name = "Senate" if chamber == "upper" else "House"
        with self.urlopen(
            ksapi.url + "bill_status/"
        ) as bill_request:  # perhaps we should save this data so we can make on request for both chambers?
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:
                # filtering out other chambers; the flag is computed afresh
                # for every bill (it used to carry over from earlier bills
                # and was unbound for a first bill outside the chamber)
                bill_is_in_chamber = any(
                    history["chamber"] == chamber_name
                    for history in bill_data["HISTORY"])
                if not bill_is_in_chamber:
                    continue

                # main
                bill = Bill(term, chamber, bill_data["BILLNO"], bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_data["BILLNO"].lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                sponsors = bill_data["SPONSOR_NAMES"]
                # a lone sponsor is the primary one; otherwise all are cosponsors
                sponsor_type = "primary" if len(sponsors) == 1 else "cosponsor"
                for sponsor in sponsors:
                    bill.add_sponsor(sponsor_type, sponsor)

                for event in bill_data["HISTORY"]:
                    # The names are read from the event they were found in.
                    # NOTE(review): the old code tested a stale loop variable
                    # and read the lists from bill_data - confirm the lists
                    # live on the event.
                    has_committees = "committee_names" in event
                    has_conferees = "conferee_names" in event
                    if has_committees and has_conferees:
                        actor = " and ".join(event["committee_names"] + event["conferee_names"])
                    elif has_committees:
                        actor = " and ".join(event["committee_names"])
                    elif has_conferees:
                        actor = " and ".join(event["conferee_names"])
                    else:
                        # `chamber` is 'upper'/'lower' and never equals
                        # "Senate", so the old comparison always gave
                        # "lower"; the event's own chamber name is used.
                        # NOTE(review): confirm this is the intended actor.
                        actor = "upper" if event["chamber"] == "Senate" else "lower"

                    date = datetime.datetime.strptime(event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        votes = votes_re.match(event["status"])
                        if votes:
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(ksapi.ksleg + "bill_status/" + bill_data["BILLNO"].lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Example #15
0
    def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                    action_text):
        """Fetch one roll-call result page and add the vote to ``bill``.

        Names are grouped by their vote code (Y, N, P, A).  Where the page
        lists totals they are asserted to equal the number of names found.
        """
        url = ('http://alisondb.legislature.state.al.us/Alison/'
               'GetRollCallVoteResults.aspx?'
               'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
               format(vote_id, vote_chamber, bill_id, self.session_id))
        doc = lxml.html.fromstring(self.get(url=url).text)

        voters = {'Y': [], 'N': [], 'P': [], 'A': []}

        # The text cells alternate: a name, then the code that name voted
        cells = doc.xpath('//table/tr/td/font/text()')
        for name, code in zip(cells[0::2], cells[1::2]):
            is_placeholder = (name.endswith(", Vacant") or
                              name.startswith("Total ") or
                              not name.strip())
            if not is_placeholder:
                voters[code].append(name)

        def listed_total(path):
            # number after the last ':' of the first match, or None if absent
            found = doc.xpath(path)
            if not found:
                return None
            return int(found[0].split(":")[-1])

        yea_names, nay_names = voters['Y'], voters['N']
        absent_names, present_names = voters['A'], voters['P']

        # Check name counts against totals listed on the site
        total_yea = listed_total('//*[starts-with(text(), "Total Yea")]/text()')
        if total_yea is None:
            total_yea = len(yea_names)
        else:
            assert total_yea == len(yea_names), "Yea count incorrect"

        total_nay = listed_total('//*[starts-with(text(), "Total Nay")]/text()')
        if total_nay is None:
            total_nay = len(nay_names)
        else:
            assert total_nay == len(nay_names), "Nay count incorrect"

        total_absent = listed_total(
            '//*[starts-with(text(), "Total Absent")]/text()')
        if total_absent is not None:
            assert total_absent == len(absent_names), "Absent count incorrect"
        total_other = len(present_names) + len(absent_names)

        vote = Vote(
            self.CHAMBERS[vote_chamber[0]], vote_date, action_text,
            total_yea > total_nay, total_yea, total_nay, total_other)
        vote.add_source(url)

        recorded = (
            (vote.yes, yea_names),
            (vote.no, nay_names),
            (vote.other, absent_names + present_names),
        )
        for record, members in recorded:
            for member in members:
                record(member)

        bill.add_vote(vote)
Example #16
0
    def scrape_vote(self, bill, date, url):
        """Parse the vote page at ``url`` and add the vote to ``bill``.

        The page header is split on ', ': the second field gives the chamber
        and optional committee, the remaining fields give the motion.

        Raises ScrapeError when the header names neither chamber.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

            location = header.split(', ')[1]

            if location.startswith('House'):
                chamber = 'lower'
            elif location.startswith('Senate'):
                chamber = 'upper'
            else:
                # report the text that failed to match; `chamber` is not
                # bound on this path, so formatting it raised
                # UnboundLocalError instead of the intended ScrapeError
                raise ScrapeError("Bad chamber: %s" % location)

            # whatever follows the chamber word is the committee name, unless
            # it is just the rest of "House of Representatives"
            committee = ' '.join(location.split(' ')[1:]).strip()
            if not committee or committee.startswith('of Representatives'):
                committee = None

            motion = ', '.join(header.split(', ')[2:]).strip()

            yes_count = int(
                page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
            other_count = excused_count + absent_count

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                vote_type = 'passage'
            elif motion == 'Concurred in amendments':
                vote_type = 'amendment'
            elif motion == 'Veto override':
                vote_type = 'veto_override'
            else:
                vote_type = 'other'

            vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                        other_count)
            vote['type'] = vote_type

            if committee:
                vote['committee'] = committee

            vote.add_source(url)

            # each vote cell is preceded by the cell holding the voter's name
            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text == 'Yea':
                    vote.yes(td.getprevious().text.strip())
                elif td.text == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif td.text in ('Excused', 'Absent'):
                    vote.other(td.getprevious().text.strip())

            bill.add_vote(vote)
Example #17
0
    def scrape_vote(self, bill, vote_type_id, vote_type):
        """Fetch the voting page for one vote type and add the vote to ``bill``."""
        base_url = "http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s"
        url = base_url % (vote_type_id, bill["bill_id"])

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            def first(path):
                # first text node matched by the XPath expression
                return doc.xpath(path)[0]

            vote_date = convert_date(doc.get_element_by_id("VoteDate").text)

            # check if voice vote / approved boxes have an 'x'
            voice = first('//span[@id="VoteTypeVoice"]/b/text()') == "x"
            passed = first('//span[@id="VoteResultApproved"]/b/text()') == "x"

            yes_count = extract_int(first('//span[@id="VoteCount1"]/b/text()'))
            no_count = extract_int(first('//span[@id="VoteCount2"]/b/text()'))
            # every now and then this actually drops below 0 (error in count)
            remainder = 13 - (yes_count + no_count)
            other_count = max(remainder, 0)

            vote = Vote("upper", vote_date, vote_type, passed, yes_count, no_count, other_count, voice_vote=voice)

            vote.add_source(url)

            # members are only text on page in a <u> tag
            for member_u in doc.xpath("//u"):
                member = member_u.text
                vote_text = member_u.xpath("../../i/text()")[0]
                if "Yes" in vote_text:
                    record = vote.yes
                elif "No" in vote_text:
                    record = vote.no
                else:
                    record = vote.other
                record(member)
        bill.add_vote(vote)
Example #18
0
    def scrape_votes(self, bill, bill_type, number, session):
        """Scrape every journal vote listed for a bill and add it to ``bill``.

        Each journal link supplies the date; the details cell of its row
        supplies the chamber and motion; the following row holds the yea and
        nay name tables.  Raises ScrapeError for an unrecognised chamber.
        """
        vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                    session + '_' + bill_type + '_' + str(number))

        chamber_codes = {'House': 'lower', 'Senate': 'upper'}

        def names_in(row, div_path):
            # non-empty cell texts of the name table inside the matching div
            container = row.xpath(div_path)[0]
            texts = (td.xpath("string()")
                     for td in container.xpath("table/tr/td"))
            return [text for text in texts if text]

        with self.urlopen(vote_url) as page:
            page = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
                date = datetime.datetime.strptime(jlink.text,
                                                  "%m/%d/%Y").date()

                details = jlink.xpath("string(../../../td[2])")
                fields = details.split(" - ")

                chamber_label = fields[0]
                if chamber_label not in chamber_codes:
                    raise ScrapeError("Bad chamber: %s" % chamber_label)
                chamber = chamber_codes[chamber_label]

                # the motion is the first line of the second field
                motion = fields[1].split("\n")[0].strip()

                # names live in the row after the one holding the link
                vote_row = jlink.xpath("../../..")[0].getnext()

                yeas = names_in(vote_row, "td/font/div[contains(@id, 'Yea')]")
                nays = names_in(vote_row, "td/font/div[contains(@id, 'Nay')]")

                yes_count, no_count = len(yeas), len(nays)

                vote = Vote(chamber, date, motion, yes_count > no_count,
                            yes_count, no_count, 0)

                for member in yeas:
                    vote.yes(member)
                for member in nays:
                    vote.no(member)

                bill.add_vote(vote)
Example #19
0
    def scrape_vote(self, bill, name, url):
        """Scrape one roll-call page and attach the resulting Vote to ``bill``.

        ``name`` is used as the motion text.  URLs containing ``VOTE/H`` are
        treated as lower-chamber tallies and everything else as upper-chamber;
        the two layouts differ in which table columns hold the yes/no marks
        and the member names.  Pages containing "BUDGET ADDRESS" are skipped.

        The vote is recorded as passed when the yea count reaches the
        "Necessary for" count shown on the page.
        """
        if "VOTE/H" in url:
            vote_chamber = "lower"
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = "upper"
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        # Connecticut's SSL is causing problems with Scrapelib, so use Requests
        # NOTE(review): verify=False disables TLS certificate validation.
        page = requests.get(url, verify=False).text

        if "BUDGET ADDRESS" in page:
            return

        page = lxml.html.fromstring(page)

        def labelled_count(label):
            # First run of digits in the span whose text contains ``label``.
            text = page.xpath("string(//span[contains(., '%s')])" % label)
            return int(re.match(r"[^\d]*(\d+)[^\d]*", text).group(1))

        yes_count = labelled_count("Those voting Yea")
        no_count = labelled_count("Those voting Nay")
        other_count = labelled_count("Those absent")
        need_count = labelled_count("Necessary for")

        # The page only gives month/day; the year comes from the bill session.
        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
        date = date.replace(" ", "")
        date = datetime.datetime.strptime(date + " " + bill["session"], "%m/%d %Y").date()

        # Reaching the necessary number of yeas is enough to pass, so the
        # comparison is inclusive.
        passed = yes_count >= need_count
        vote = Vote(vote_chamber, date, name, passed, yes_count, no_count, other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            # Each row holds several members side by side, one per base column.
            for i in cols:
                member = row.xpath("string(td[%d])" % (i + name_offset)).strip()

                if not member or member == "VACANT":
                    continue

                if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                    vote.yes(member)
                elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                    vote.no(member)
                else:
                    vote.other(member)

        bill.add_vote(vote)
Example #20
0
    def scrape_vote(self, bill, date, url):
        """Scrape a single vote page and attach the Vote to ``bill``.

        The ``hdVote`` header is split on ", ": the second piece gives the
        chamber (and optionally a committee name), the remaining pieces form
        the motion.  Pages without a motion are skipped silently.

        Raises ScrapeError when the location starts with neither "House"
        nor "Senate".
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

            location = header.split(", ")[1]

            if location.startswith("House"):
                chamber = "lower"
            elif location.startswith("Senate"):
                chamber = "upper"
            else:
                # Report the text we failed to classify; ``chamber`` is not
                # bound on this path.
                raise ScrapeError("Bad chamber: %s" % location)

            # Whatever follows the first word of the location is the committee.
            committee = " ".join(location.split(" ")[1:]).strip()
            if not committee or committee.startswith("of Representatives"):
                committee = None

            motion = ", ".join(header.split(", ")[2:]).strip()
            if not motion:
                # If we can't detect a motion, skip this vote
                return

            yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
            other_count = excused_count + absent_count

            passed = yes_count > no_count

            if motion.startswith("Do Pass"):
                vote_type = "passage"
            elif motion == "Concurred in amendments":
                vote_type = "amendment"
            elif motion == "Veto override":
                vote_type = "veto_override"
            else:
                vote_type = "other"

            vote = Vote(chamber, date, motion, passed, yes_count, no_count, other_count)
            vote["type"] = vote_type

            if committee:
                vote["committee"] = committee

            vote.add_source(url)

            # Each vote cell is preceded by the cell holding the member name.
            recorders = {
                "Yea": vote.yes,
                "Nay": vote.no,
                "Excused": vote.other,
                "Absent": vote.other,
            }
            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                record = recorders.get(td.text)
                if record is not None:
                    record(td.getprevious().text.strip())

            bill.add_vote(vote)
Example #21
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        """Scrape a vote summary page and attach the Vote to ``bill``.

        Nothing is added when the MOTION cell is missing (a warning is
        logged), when the motion mentions 'withdrawn', or when the page has
        neither a FINAL ACTION entry nor a 'passed without objection' motion.
        """
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath('//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
        except IndexError:
            # Only an empty XPath result means the page is broken; anything
            # else should surface instead of being swallowed.
            self.warning("Vote Summary Page Broken ")
            return

        if 'withdrawn' in motion:
            return

        # Every table row after the one with VOTE in a td/div/b/font
        rolls = page.xpath('//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')

        # The last row carries the totals and the final action.
        count_row = rolls[-1]

        def tally(label):
            # Text of the <font> following the bold label in the totals row.
            return count_row.xpath(
                './/b/font[normalize-space(text())="%s"]'
                '/../following-sibling::font[1]/text()' % label)[0]

        yes_count = tally("YES:")
        no_count = tally("NO:")
        exc_count = tally("EXC:")
        nv_count = tally("ABS:")

        final_action = count_row.xpath(
            './/b/font[normalize-space(text())="FINAL ACTION:"]'
            '/../following-sibling::b[1]/font/text()')

        if final_action:
            passed = ('pass' in final_action[0].lower() or
                      int(yes_count) > int(no_count))
        elif 'passed without objection' in motion.lower():
            passed = True
            # Without objection, every listed member counts as a yes.
            yes_count = len(rolls[:-2])
        else:
            self.warning("No vote breakdown found for %s" % vote_url)
            return

        other_count = int(exc_count) + int(nv_count)

        vote = Vote(chamber, date, motion, passed,
                    int(yes_count), int(no_count), int(other_count))

        without_objection = 'passed without objection' in motion.lower()

        # The final two rows are not member rows.
        for roll in rolls[:-2]:
            voter = roll.xpath('td[2]/div/font')[0].text_content()
            voted = roll.xpath('td[3]/div/font')[0].text_content().strip()
            if voted:
                if 'Yes' in voted:
                    vote.yes(voter)
                elif 'No' in voted:
                    vote.no(voter)
                else:
                    vote.other(voter)
            elif without_objection and voter:
                vote.yes(voter)

        bill.add_vote(vote)
Example #22
0
    def scrape_votes(self, bill, votes_url):
        """Download each linked vote-history PDF and attach a Vote to ``bill``.

        Link text is split on ' - ': the last piece is the date and, when
        there are exactly three pieces, the middle one is the motion
        (otherwise 'Third Reading').  Counts and the passed flag are derived
        from the names parsed out of the PDF text.
        """
        html = self.urlopen(votes_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(votes_url)

        # vote indicator, a few spaces, a name, newline or multiple spaces
        VOTE_RE = re.compile(r'(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

        for link in doc.xpath('//a[contains(@href, "votehistory")]'):

            pieces = link.text.split(' - ')
            date = pieces[-1]
            if len(pieces) == 3:
                motion = pieces[1]
            else:
                motion = 'Third Reading'

            chamber = link.xpath('../following-sibling::td/text()')[0]
            if chamber == 'HOUSE':
                chamber = 'lower'
            elif chamber == 'SENATE':
                chamber = 'upper'
            else:
                # NOTE(review): the vote is still created with the raw
                # chamber text after this warning -- confirm that is wanted.
                self.warning('unknown chamber %s' % chamber)

            date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

            # download the file; always delete it, even if conversion fails
            fname, resp = self.urlretrieve(link.get('href'))
            try:
                pdflines = convert_pdf(fname, 'text').splitlines()
            finally:
                os.remove(fname)

            vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)

            for line in pdflines:
                for vcode, name in VOTE_RE.findall(line):
                    if vcode == 'Y':
                        vote.yes(name)
                    elif vcode == 'N':
                        vote.no(name)
                    else:
                        vote.other(name)

            # fake the counts
            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(link.get('href'))

            bill.add_vote(vote)
Example #23
0
    def parse_vote(self, bill, action, act_chamber, act_date, url):
        """Build a Vote from an action string plus its roll-call page.

        Totals come from ``Y##``/``N##``/``E##``/``A##`` tokens in ``action``
        (``-`` counts as 0; E and A are summed as "other").  The motion is
        pulled from the raw HTML; when it cannot be found nothing is added.
        Member names are read from the second ``<pre>`` element.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        yes = no = other = 0

        tally = re.findall(r'(?:(Y|N|E|A)(-|\d+)\s*)', action)

        for vtype, vcount in tally:
            vcount = int(vcount) if vcount != '-' else 0
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                other += vcount

        # regex against plain html for motion
        try:
            motion = re.findall(r'The question being:\s*"(.*)\?"', html,
                                re.DOTALL)[0].replace('\r\n', ' ')
        except IndexError:
            return

        vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

        # Section header -> recorder.  The header is stripped using its own
        # length so no leading character of the first name is lost.
        sections = (
            ('Yeas: ', vote.yes),
            ('Nays: ', vote.no),
            ('Excused: ', vote.other),
            ('Absent: ', vote.other),
        )

        vote_lines = doc.xpath('//pre')[1].text_content().split('\r\n')
        record = None
        for vote_list in vote_lines:
            for prefix, recorder in sections:
                if vote_list.startswith(prefix):
                    vote_list, record = vote_list[len(prefix):], recorder
                    break
            else:
                # A blank line ends the current section; any other line is a
                # continuation of the section in progress.
                if vote_list.strip() == '':
                    record = None
            if record:
                for name in vote_list.split(','):
                    name = name.strip()
                    if name:
                        record(name)

        vote.add_source(url)
        bill.add_vote(vote)
Example #24
0
    def vote(self):
        '''Return a billy vote.

        Counts and member names come from ``self._parse()``; each name is
        recorded both through the matching ``yes``/``no``/``other`` method
        and under its literal vote label in the ``actual_vote`` mapping.
        '''
        actual_vote_dict = collections.defaultdict(list)
        vote = Vote('lower', self.date(), self.motion(),
                    self.passed(), 0, 0, 0,
                    actual_vote={})

        for (vote_val, count), (actual_vote, _), text in self._parse():
            vote[vote_val + '_count'] = count
            for name in filter(None, PlaintextColumns(text)):
                actual_vote_dict[actual_vote].append(name)
                getattr(vote, vote_val)(name)

        # Store the mapping only after it has been filled; copying it at
        # construction time would capture an empty dict.
        # NOTE(review): assumes Vote keeps extra keyword arguments as items
        # under the same key -- confirm against the Vote class.
        vote['actual_vote'] = dict(actual_vote_dict)

        vote.add_source(self.url)
        return vote
Example #25
0
    def _parse_senate_votes(self, vote_data):
        """Convert one vote record into an upper-chamber Vote.

        Reads ``voteDate`` (``YYYY-MM-DD``), ``voteType`` and, for committee
        votes, ``committee.name`` from ``vote_data``; member names come from
        ``memberVotes.items.<CODE>.items[*].fullName``.  AYE and AYEWR count
        as yes, NAY as no, and EXC/ABS/ABD as other.  The vote passes when
        yes outnumbers no.

        Raises ValueError for a vote type other than FLOOR or COMMITTEE.
        """
        vote_datetime = datetime.datetime.strptime(vote_data['voteDate'],
            '%Y-%m-%d')

        vote = Vote(
            chamber='upper',
            date=vote_datetime.date(),
            motion='[No motion available.]',
            passed=False,
            yes_votes=[],
            no_votes=[],
            other_votes=[],
            yes_count=0,
            no_count=0,
            other_count=0)

        if vote_data['voteType'] == 'FLOOR':
            vote['motion'] = 'Floor Vote'
        elif vote_data['voteType'] == 'COMMITTEE':
            vote['motion'] = '{} Vote'.format(vote_data['committee']['name'])
        else:
            raise ValueError('Unknown vote type encountered.')

        vote_rolls = vote_data['memberVotes']['items']

        # (roll code, recorder, counter key), in the order they are tallied.
        tallies = (
            ('AYE', vote.yes, 'yes_count'),
            ('AYEWR', vote.yes, 'yes_count'),
            ('NAY', vote.no, 'no_count'),
            ('EXC', vote.other, 'other_count'),
            ('ABS', vote.other, 'other_count'),
            ('ABD', vote.other, 'other_count'),
        )

        for code, record, counter in tallies:
            # A code may be absent, empty, or lack an 'items' list; all of
            # those mean "nobody voted this way".
            roll = vote_rolls.get(code) or {}
            for legislator in roll.get('items') or ():
                record(legislator['fullName'])
                vote[counter] += 1

        vote['passed'] = vote['yes_count'] > vote['no_count']

        return vote
Example #26
0
    def scrape_vote(self, bill, chamber, url):
        """Scrape a roll-call page and attach the Vote to ``bill``.

        Pages stating that no details are available are skipped.  The second
        row of the first table supplies date, motion, the three counts and
        the outcome; rows from the fourth onward with exactly two cells are
        treated as member votes.  Members marked 'Not Voting' are not
        recorded individually.
        """
        page = self.urlopen(url)
        if 'There are no details available for this roll call' in page:
            return
        page = page.replace('&nbsp;', ' ')
        page = lxml.html.fromstring(page)

        info_row = page.xpath("//table[1]/tr[2]")[0]

        date = info_row.xpath("string(td[1])")
        date = datetime.datetime.strptime(date, "%m/%d/%Y")

        motion = info_row.xpath("string(td[2])")
        yes_count = int(info_row.xpath("string(td[3])"))
        no_count = int(info_row.xpath("string(td[4])"))
        other_count = int(info_row.xpath("string(td[5])"))
        passed = info_row.xpath("string(td[6])") == 'Pass'

        if motion == 'Shall the bill pass?':
            vote_type = 'passage'
        elif motion == 'Shall the bill be read the third time?':
            vote_type = 'reading:3'
        elif 'be amended as' in motion:
            vote_type = 'amendment'
        else:
            vote_type = 'other'

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        # Record the classification; it was previously computed and dropped.
        vote['type'] = vote_type
        vote.add_source(url)

        for tr in page.xpath("//table[1]/tr")[3:]:
            if len(tr.xpath("td")) != 2:
                continue

            # avoid splitting duplicate names
            name = tr.xpath("string(td[1])").strip()
            if not name.startswith(DOUBLED_NAMES):
                name = name.split(' of')[0]

            cast = tr.xpath("string(td[2])").strip()
            if cast.startswith('Yea'):
                vote.yes(name)
            elif cast.startswith('Nay'):
                vote.no(name)
            elif cast.startswith('Not Voting'):
                pass
            else:
                vote.other(name)

        bill.add_vote(vote)
Example #27
0
    def scrape_vote(self, bill, motion, url):
        """Scrape a roll-call page and attach the Vote to ``bill``.

        The chamber is inferred from a ``chamber=House`` / ``chamber=Senate``
        marker in ``url``.  Totals, date and outcome are read from the cell
        following each label cell; absent and excused are summed as "other".
        The vote passes when the outcome text is exactly 'PREVAILS'.

        Raises ValueError when ``url`` names neither chamber.
        """
        if 'chamber=House' in url:
            chamber = 'lower'
        elif 'chamber=Senate' in url:
            chamber = 'upper'
        else:
            # Fail with a clear message rather than an unbound-name error
            # further down.
            raise ValueError('Cannot determine chamber from %s' % url)

        page = self.urlopen(url, retry_on_404=True)
        page = lxml.html.fromstring(page)

        def field(label):
            # String value of the cell(s) following the cell labelled ``label``.
            cell = page.xpath("//td[text() = '%s']" % label)[0]
            return cell.xpath("string(following-sibling::td)")

        yes_count = int(field('Yeas (Y):'))
        no_count = int(field('Nays (N):'))
        abs_count = int(field('Absent (X):'))
        ex_count = int(field('Excused (E):'))

        other_count = abs_count + ex_count

        # Months appear either spelled out or abbreviated with a period.
        date = field('Date:')
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome = field('Outcome:')

        vote = Vote(chamber, date, motion,
                    outcome == 'PREVAILS',
                    yes_count, no_count, other_count)
        vote.add_source(url)

        # Member rows follow the header row that contains the 'Member' cell.
        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")

            vtype = row.xpath("string(td[4])")
            if vtype == 'Y':
                vote.yes(name)
            elif vtype == 'N':
                vote.no(name)
            elif vtype in ('X', 'E'):
                vote.other(name)

        bill.add_vote(vote)
Example #28
0
    def parse_vote(self, bill, actor, date, motion, url, uniqid):
        """Parse a plain-text roll call and attach the Vote to ``bill``.

        The page is matched against a YEAS / NAYS / ABSENT (OR NOT VOTING)
        pattern; each section yields a count followed by names separated by
        runs of two or more whitespace characters.  When ``actor`` is not
        'upper' or 'lower' it is stored as the vote's ``location`` and the
        chamber is left empty.  ``url`` is added as a source on both the
        bill and the vote.
        """
        page = self.get(url).text
        bill.add_source(url)
        vote_re = re.compile(
            r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
            r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
            r"(\d+)(.*)",
            re.MULTILINE | re.DOTALL,
        )
        # NOTE(review): a page that does not match leaves ``match`` as None
        # and the next line raises AttributeError -- confirm this is intended.
        match = vote_re.search(page)
        yes_count = int(match.group(1))
        no_count = int(match.group(3))
        other_count = int(match.group(6))

        passed = yes_count > no_count

        if actor in ("upper", "lower"):
            vote_chamber = actor
            vote_location = ""
        else:
            vote_chamber = ""
            vote_location = actor

        vote = Vote(
            vote_chamber,
            date,
            motion,
            passed,
            yes_count,
            no_count,
            other_count,
            location=vote_location,
            _vote_id=uniqid,
        )
        vote.add_source(url)

        name_gap = re.compile(r"\s{2,}")
        # (regex group holding the names, recorder) per section.
        sections = ((2, vote.yes), (4, vote.no), (7, vote.other))
        for group, record in sections:
            for member in name_gap.split(match.group(group).strip()):
                if member:
                    record(member)

        bill.add_vote(vote)
Example #29
0
    def scrape_votes(self, session):
        """Load roll-call summaries and member votes from the zip archive.

        ``tblrollcallsummary.txt`` rows (pipe-delimited, 14 fields) create a
        Vote for every bill in ``self.bills_by_id`` belonging to ``session``;
        ``tblrollcallhistory.txt`` rows then record each legislator's vote on
        the matching Vote, keyed by body + vote number.

        A summary row split across two physical lines is stitched back
        together; a short line that cannot be completed is skipped with a
        warning.
        """
        votes = {}
        last_line = []

        for line in self.zf.open("tblrollcallsummary.txt"):
            line = line.split("|")
            if len(line) < 14:
                # The first piece of a continuation line is the tail of the
                # field that was broken, so it is dropped when merging.
                merged = last_line + line[1:]
                if len(merged) == 14:
                    line = merged
                    last_line = []
                    self.warning("used bad vote line")
                else:
                    # Hold on to the fragment and wait for its continuation
                    # instead of parsing an incomplete record.
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
                    continue
            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            absent = int(line[8])
            motion = line[11].strip()

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                # %p only takes effect together with the 12-hour %I directive.
                time = datetime.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(actor, time, motion, passed, yeas, nays, absent)
                votes[body + vote_num] = vote
                self.bills_by_id[bill_id].add_vote(vote)

        for line in self.zf.open("tblrollcallhistory.txt"):
            session_yr, body, v_num, employee, bill_id, cast = line.split("|")

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                leg = self.legislators[employee]["name"]
                cast = cast.strip()
                roll_call = votes[body + v_num]
                if cast == "Yea":
                    roll_call.yes(leg)
                elif cast == "Nay":
                    roll_call.no(leg)
                else:
                    roll_call.other(leg)
Example #30
0
    def parse_vote(self, actor, date, row):
        """
        takes the actor, date and row element and returns a Vote object

        The first span holds ``<outcome>-<yes>-<no>-<other>``; the text after
        the second and third spans lists the yes and no voters, and later
        spans starting with 'Absent' or 'Excused' list the other voters.
        The outcome is True for 'adopted'/'passed', False for 'failed', and
        otherwise falls back to comparing the yes and no counts so that the
        flag is always a boolean.
        """
        spans = row.xpath('.//span')
        motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
        motion = motion if motion else "passage"
        outcome, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3)
        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)

        other_votes = []
        for span in spans[3:]:
            # A span without direct text cannot be an Absent/Excused header.
            if (span.text or '').startswith(('Absent', 'Excused')):
                other_votes += self.get_names(span.tail)

        outcome_text = outcome.lower()
        if 'adopted' in outcome_text or 'passed' in outcome_text:
            passed = True
        elif 'failed' in outcome_text:
            passed = False
        else:
            # Unrecognised wording: decide by the counts rather than letting
            # a non-empty string stand in for True.
            passed = int(yes_count) > int(no_count)

        vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count),
                    int(other_count))

        for names, record in ((yes_votes, vote.yes),
                              (no_votes, vote.no),
                              (other_votes, vote.other)):
            for name in names:
                if name and name != 'None':
                    record(name)
        return vote
Example #31
0
    def parse_senate_vote(self, url):
        """ senate PDFs -> garbled text -> good text -> Vote

        The cleaned text is walked with a two-state machine: before the
        'YES NO ABS EXC' header the date and PASSED marker are picked up;
        after it, each line holds two name/mark pairs until the TOTALS line,
        which supplies the counts.  A member's vote is inferred from the
        amount of whitespace between the name and the X mark.
        """
        vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
        vote.add_source(url)

        fname, resp = self.urlretrieve(url)
        try:
            # this gives us the cleaned up text
            sv_text = convert_sv_text(convert_pdf(fname, 'text'))
        finally:
            # remove the download even when conversion fails
            os.remove(fname)
        in_votes = False

        def record(name, spaces):
            # vote can be determined by # of spaces
            gap = len(spaces)
            if gap == 1:
                vote.yes(name)
            elif gap == 2:
                vote.no(name)
            else:
                vote.other(name)

        # use in_votes as a sort of state machine
        for line in sv_text:

            # not 'in_votes', get date or passage
            if not in_votes:
                dmatch = re.search(r'DATE:(\d{2}-\d{2}-\d{2})', line)
                if dmatch:
                    date = dmatch.groups()[0]
                    vote['date'] = datetime.strptime(date, '%m-%d-%y')

                if 'YES NO ABS EXC' in line:
                    in_votes = True
                elif 'PASSED' in line:
                    vote['passed'] = True
                continue

            # in_votes: totals & votes
            if 'TOTALS' in line:

                # Lt. Governor voted
                if 'GOVERNOR' in line:
                    name, spaces, line = re.match(r' ([A-Z,.]+)(\s+)X(.*)',
                                                  line).groups()
                    # Only yes or no is recorded here: a single space means
                    # yes, anything wider means no.
                    if len(spaces) == 1:
                        vote.yes(name)
                    else:
                        vote.no(name)

                _, yes, no, absent, excused = line.split()
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(absent) + int(excused)
                # no longer in votes
                in_votes = False
                continue

            # pull votes out
            # NOTE(review): a line that does not hold two name/X pairs makes
            # re.match return None and raises AttributeError -- confirm that
            # convert_sv_text never yields such lines here.
            matches = re.match(r' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X',
                               line).groups()
            name1, spaces1, name2, spaces2 = matches

            record(name1, spaces1)
            record(name2, spaces2)
        return vote
Example #32
0
    def scrape_votes(self, bill, link):
        """Parse every vote in the ``lblVoteData`` span and attach it to ``bill``.

        The span text is split into one record per vote; each record is then
        split on runs of ten non-breaking spaces.  The first piece is the
        motion (which may contain a date), the rest carry counts and name
        lists.  A name is recorded only once per vote, under the first
        category it appears in.  Returns ``bill``.
        """
        page = self.get(link).text
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath(
            "//span[@id='lblVoteData']")[0].text_content()
        raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-',
                                 raw_vote_data.strip())[1:]

        # Compiled once; none of these depend on the vote being parsed.
        date_regex = re.compile(r'(\d+/\d+/\d+)')
        vote_regex = re.compile(r'\d+$')
        aye_regex = re.compile(r'^.+voting aye were: (.+) -')
        no_regex = re.compile(r'^.+voting no were: (.+) -')
        other_regex = re.compile(r'^.+present and not voting were: (.+) -')

        if 'ChamberVoting=H' in link:
            chamber = 'lower'
        else:
            chamber = 'upper'

        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(
                u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            # NOTE(review): vote_date stays None when the motion has no date.
            vote_date = date_regex.search(motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(),
                                                       '%m/%d/%Y')

            # A record may consist of the motion alone, so only look at the
            # second piece when there is one.
            passed = ('Passed' in motion or 'Recommended for passage' in motion
                      or (len(raw_vote) > 1 and 'Adopted' in raw_vote[1]))
            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for v in raw_vote[1:]:
                v = v.strip()
                if v.startswith('Ayes...') and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith('Noes...') and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif v.startswith(
                        'Present and not voting...') and vote_regex.search(v):
                    other_count += int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(', ')
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(', ')
                elif other_regex.search(v):
                    others += other_regex.search(v).groups()[0].split(', ')

            vote = Vote(chamber, vote_date, motion, passed, yes_count,
                        no_count, other_count)
            vote.add_source(link)

            # Record each name once, giving priority to yes, then no.
            seen = set()
            for names, record in ((ayes, vote.yes),
                                  (nos, vote.no),
                                  (others, vote.other)):
                for member in names:
                    if member in seen:
                        continue
                    record(member)
                    seen.add(member)

            # vote.validate()
            bill.add_vote(vote)

        return bill
Example #33
0
    def _process_votes(self,rollcalls,bill,proxy):
        """Download each roll-call PDF and attach the parsed Vote to ``bill``.

        For every entry in ``rollcalls`` the PDF at ``proxy["url"] +
        r["link"]`` is converted to text.  The first lines give the chamber,
        date/time, result and the four totals; the remaining lines are
        scanned for Yea/Nay/Excused/Not Voting headers followed by names
        separated by double spaces.

        A warning is logged when the number of parsed names differs from the
        stated total.  Roll calls whose totals cannot be parsed are skipped.

        Raises AssertionError when no result keyword is found, or when the
        result contradicts the yes/no totals.
        """
        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True,
            'NOT SECONDED': False,
            'OVERRIDDEN': True,
            'ADOPTED': True,
        }

        for r in rollcalls:
            proxy_link = proxy["url"] + r["link"]
            (path, resp) = self.urlretrieve(proxy_link)
            try:
                text = convert_pdf(path, 'text')
            finally:
                # remove the download even when conversion fails
                os.remove(path)
            lines = text.split("\n")

            chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
            date_parts = lines[1].strip().split()[-3:]
            date_str = " ".join(date_parts).title() + " " + lines[2].strip()
            vote_date = datetime.datetime.strptime(date_str,"%b %d, %Y %I:%M:%S %p")

            passed = None

            for res,val in result_types.items():
                # We check multiple lines now because the result of the
                # roll call vote as parsed can potentially be split.
                # PDF documents suck.
                for line in lines[3:5]:
                    if res in line.upper():
                        passed = val
                        break

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                yeas = int(lines[4].split()[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue
            other_count = excused + not_voting

            vote = Vote(chamber,vote_date,motion,passed,yeas,nays,other_count,yes_votes=[],no_votes=[],other_votes=[])

            vote.add_source(proxy_link)

            currently_counting = ""

            possible_vote_lines = lines[8:]
            for text_line in possible_vote_lines:
                text_line = text_line.replace("NOT\xc2\xa0VOTING","NOT VOTING")
                text_line = text_line.replace("\xc2\xa0"," -")
                squashed = text_line.lower().replace(" ","")
                if "yea-" in squashed:
                    currently_counting = "yes_votes"
                elif "nay-" in squashed:
                    currently_counting = "no_votes"
                elif "excused-" in squashed:
                    currently_counting = "other_votes"
                elif "notvoting-" in squashed:
                    currently_counting = "other_votes"
                elif currently_counting == "":
                    pass
                elif re.search(r'v\. \d\.\d',text_line):
                    #this gets rid of the version number
                    #which is often found at the bottom of the doc
                    pass
                else:
                    voters = text_line.split("  ")
                    for v in voters:
                        if v.strip():
                            vote[currently_counting].append(v.strip())

            # Warn only when the parsed names disagree with the stated total.
            for label, prefix in (("Yes", "yes"), ("No", "no"), ("Other", "other")):
                stated = vote[prefix + "_count"]
                parsed = len(vote[prefix + "_votes"])
                if parsed != stated:
                    self.logger.warning("{label} vote counts ({count}) don't match count of actual votes ({actual}): {url}".format(label=label,count=stated,actual=parsed,url=proxy_link))

            #indiana only has simple majorities even for veto overrides
            #if passage status isn't the same as yes>no, then we should look!
            bill_type = bill['type'][0]

            vote_invalid = False
            # It seems resolutions may be passed without a recorded vote.
            # Don't understand why there's a roll call then, but hey.
            if 'resolution' in bill_type:
                if vote['passed'] != (vote['yes_count'] >= vote['no_count']):
                    vote_invalid = True
            else:
                if vote['passed'] != (vote['yes_count'] > vote['no_count']):
                    vote_invalid = True

            if vote_invalid:
                raise AssertionError('Vote count doesn\'t agree with vote '
                    'passage status.')

            bill.add_vote(vote)
Example #34
0
    def scrape(self, chamber, session):
        """Scrape roll call votes from the journal PDFs of one chamber.

        The journal index page for the session is fetched, every linked
        PDF is converted to text, and the text is walked line by line with
        a small state machine: wait for a "ROLL CALL" header, collect the
        motion text until a line ending in "VOTING", collect the voter
        lists, and finish on the decision line.  Each completed roll call
        is handed to ``self.save_vote``.

        :param chamber: ``'lower'`` selects the house journal; any other
            value selects the senate journal.
        :param session: session key; it must be a key of the slug table
            below, otherwise ``KeyError`` is raised.
        :raises AssertionError: if the parser is collecting a motion and
            votes at the same time, or if vote lists leak into the motion.
        """
        chamber_name = 'house' if chamber == 'lower' else 'senate'
        session_slug = {
                '62': '62-2011',
                '63': '63-2013',
                '64': '64-2015',
                '65': '65-2017',
                }[session]

        # Words that mark the decision line which closes a roll call
        decision_words = ('passed', 'adopted', 'sustained', 'prevailed',
                          'lost', 'failed')
        # Heading used in the journal -> vote category
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other"
        }
        # First letter of the bill id -> chamber the bill belongs to
        chambers = {
            "H": "lower",
            "S": "upper",
            "J": "joint"
        }

        # Open the index page of the session's journals, and open each PDF
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug, chamber_name)
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            name_may_be_continued = False

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib['href']

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                # Best effort: skip this journal, but leave a trace
                self.warning("Could not retrieve %s; skipping." % pdf_url)
                continue

            # Convert the PDF to text; always remove the downloaded file
            try:
                data = convert_pdf(path, type='text')
            finally:
                os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            lines = data.splitlines()
            for line in lines:

                # Ignore lines with no information
                if re.search(chamber_re, line) or \
                        re.search(date_re, line) or \
                        re.search(page_re, line) or \
                        line.strip() == "":
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                            "Scraper should not be simultaneously processing " +
                            "motion name and votes, as it is for this motion: " +
                            cur_motion
                            )

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith(("VOTING", "VOTING.")):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    lowered = line.lower()
                    is_decision = any(word in lowered
                                      for word in decision_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" \
                            in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [x.strip() for x in who.split(';') if x.strip() != ""]
                        results[cur_vote] = who

                        name_may_be_continued = not line.endswith(";")

                    # A line without a heading continues the current list
                    elif cur_vote is not None and not is_decision:
                        who = [x.strip() for x in line.split(";") if x.strip() != ""]

                        # Only glue onto the previous name when there is a
                        # previous name and a fragment to glue; a heading
                        # whose names start on the next line has neither.
                        if name_may_be_continued and results[cur_vote] and who:
                            results[cur_vote][-1] = results[cur_vote][-1] + \
                                    " " + who.pop(0)

                        name_may_be_continued = not line.endswith(";")

                        results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif is_decision:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills = re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line)

                        if bills == [] or cur_motion.strip() == "":
                            # Log first, then reset, so the message still
                            # carries the motion text that was collected
                            self.warning(
                                    "No motion or bill name found: " +
                                    "motion name: " + cur_motion + "; " +
                                    "decision text: " + line.strip()
                                    )
                            results = {}
                            cur_motion = ""
                            continue

                        cur_bill_id = "%s%s%s %s" % (bills[-1])

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                    "Vote data found in motion name: " +
                                    cur_motion
                                    )

                        # Use the collected results to determine who voted how
                        # (real lists, so len() works on Python 2 and 3)
                        res = {}
                        for key in keys:
                            res[keys[key]] = [
                                name for name in results.get(key, [])
                                if name != ""
                            ]

                        # Count the number of members voting each way
                        yes = len(res['yes'])
                        no = len(res['no'])
                        other = len(res['other'])

                        # Identify the source chamber for the bill
                        bc = chambers.get(cur_bill_id[0], 'other')

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            # yes / (yes + no) > 2 / 3, written with integers
                            # so it neither truncates under Python 2 nor
                            # divides by zero when nobody voted yes or no
                            passed = (yes * 3 > (yes + no) * 2)
                        else:
                            passed = (yes > no)

                        # Create a Vote object based on the scraped information
                        vote = Vote(chamber,
                                    cur_date,
                                    cur_motion,
                                    passed,
                                    yes,
                                    no,
                                    other,
                                    session=session,
                                    bill_id=cur_bill_id,
                                    bill_chamber=bc)

                        vote.add_source(pdf_url)
                        vote.add_source(url)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            obj = getattr(vote, key)
                            for person in res[key]:
                                obj(person)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys:
                            # Need to search for the singular, not plural, in the text
                            # so it can find, for example,  " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(category_name[:-1])
                            motion_count = int(re.findall(vote_re, cur_motion)[0])
                            vote_count = vote[keys[category_name] + "_count"]

                            if motion_count != vote_count:
                                self.warning(
                                        "Motion text vote counts ({}) ".format(motion_count) +
                                        "differed from roll call counts ({}) ".format(vote_count) +
                                        "for {0} on {1}".format(category_name, cur_bill_id)
                                        )
                                vote[keys[category_name] + "_count"] = motion_count

                        self.save_vote(vote)

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
Example #35
0
 def asvote(self):
     """Return a Vote built from this object's dict form.

     The three voter lists are read by calling the same-named methods on
     this object, and ``self.url`` is registered as the vote's source.
     """
     vote = Vote(**self.asdict())
     for field in ('yes_votes', 'no_votes', 'other_votes'):
         list_getter = getattr(self, field)
         vote[field] = list_getter()
     vote.add_source(self.url)
     return vote
Example #36
0
    def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        """Parse an HTML roll call page and attach the vote to ``bill``.

        The text of the first ``<b>`` element decides what happens: voice
        votes and the bare "UTAH STATE LEGISLATURE" page are skipped,
        committee pages are delegated to ``self.scrape_committee_vote``,
        and "Passed"/"Failed" set the outcome of a floor vote.  The
        remaining ``<b>`` headings ("<label> - <count>") are paired with
        the page's tables to collect the counts and member names.

        :param bill: bill object that receives the vote via ``add_vote``.
        :param actor: passed through as the first argument of ``Vote``.
        :param date: date of the vote.
        :param motion: motion text.
        :param url: address of the roll call page; also the vote source.
        :param uniqid: stored on the vote as ``_vote_id``.
        :returns: the result of ``scrape_committee_vote`` for committee
            pages, otherwise ``None``.
        :raises NotImplementedError: if the description says neither
            "Passed" nor "Failed" and is not a recognised page to skip.
        :raises KeyError: if the page lacks a "Yeas", "Nays" or
            "Absent or not voting" heading.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        descr = page.xpath("//b")[0].text_content()

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            return self.scrape_committee_vote(bill, actor, date, motion, url,
                                              uniqid)

        if "Passed" in descr:
            passed = True
        elif "Failed" in descr:
            passed = False
        elif "UTAH STATE LEGISLATURE" in descr:
            return
        else:
            logger.warning(descr)
            # NotImplemented is a comparison sentinel, not an exception;
            # calling it raised a confusing TypeError instead.
            raise NotImplementedError("Can't see if we passed or failed")

        # The first <b> is the description; the rest head the vote tables
        headings = page.xpath("//b")[1:]
        tables = page.xpath("//table")
        vdict = {}
        for (heading, table) in zip(headings, tables):
            txt = heading.text_content()
            arr = [x.strip() for x in txt.split("-", 1)]
            if len(arr) != 2:
                # Not a "<label> - <count>" heading
                continue
            v_txt, count = arr
            v_txt = v_txt.strip()
            count = int(count)
            people = [
                x.text_content().strip()
                for x in table.xpath(".//font[@face='Arial']")
            ]

            vdict[v_txt] = {"count": count, "people": people}

        vote = Vote(actor,
                    date,
                    motion,
                    passed,
                    vdict['Yeas']['count'],
                    vdict['Nays']['count'],
                    vdict['Absent or not voting']['count'],
                    _vote_id=uniqid)
        vote.add_source(url)

        for person in vdict['Yeas']['people']:
            vote.yes(person)
        for person in vdict['Nays']['people']:
            vote.no(person)
        for person in vdict['Absent or not voting']['people']:
            vote.other(person)

        logger.info(vote)
        bill.add_vote(vote)
Example #37
0
    def scrape_house_vote(self, bill, url):
        """Parse a house roll call PDF and add the vote to ``bill``.

        The PDF is converted to text.  From line 14 onward, heading lines
        ("VOTING YEA", "VOTING NAY", "EXCUSED", "NOT VOTING") carry the
        counts and switch the current category; every other non-blank
        line is cut into four fixed-width name columns.  The outcome is
        read from the "Roll Call <n>: <RESULT>" text and the motion from
        the line seven below the last "MEETING DAY" line.

        Returns without adding a vote (after logging) when no date or no
        motion can be found.

        :param bill: bill object that receives the vote via ``add_vote``.
        :param url: address of the roll call PDF; also the vote source.
        :raises AttributeError: if no "Roll Call <n>: <RESULT>" text with
            a known result word is present.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, 'text')
        os.remove(path)

        lines = text.split('\n')

        try:
            date = re.search(r'\d\d-\d\d-\d\d', text).group(0)
        except AttributeError:
            self.log("Couldn't find date on %s" % url)
            return
        date = datetime.datetime.strptime(date, "%m-%d-%y")

        # Character ranges of the four name columns on a voter line
        name_columns = ((0, 19), (19, 40), (40, 58), (58, None))

        votes = []
        yes_count, no_count, other_count = None, None, 0
        vtype = None
        for line in lines[14:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith('VOTING YEA'):
                yes_count = parse_vote_count(line.split(":")[1].strip())
                vtype = 'yes'
            elif line.startswith('VOTING NAY'):
                no_count = parse_vote_count(line.split(":")[1].strip())
                vtype = 'no'
            elif line.startswith('EXCUSED'):
                other_count += parse_vote_count(line.split(":")[1].strip())
                vtype = 'other'
            elif line.startswith('NOT VOTING'):
                other_count += parse_vote_count(line.split(":")[1].strip())
                vtype = 'other'
            else:
                for (start, end) in name_columns:
                    name = line[start:end].strip()
                    if name:
                        votes.append((name, vtype))

        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True
        }
        passed = re.search(
            r'Roll\s+Call\s+\d+:\s+(%s)' % '|'.join(result_types.keys()),
            text).group(1)
        passed = result_types[passed]

        # The motion sits seven lines below the (last) MEETING DAY line
        motion_line = None
        for i, line in enumerate(lines):
            if line.startswith('MEETING DAY'):
                motion_line = i + 7

        # Without a MEETING DAY line, or with the offset running past the
        # end of the text, there is no motion to read; treat it like an
        # empty motion instead of failing on lines[None] / an IndexError.
        if motion_line is None or motion_line >= len(lines):
            self.log("Couldn't find motion for %s" % url)
            return

        motion = re.split(r'\s{2,}', lines[motion_line].strip())[0].strip()
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote('lower', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        insert_specific_votes(vote, votes)
        check_vote_counts(vote)

        bill.add_vote(vote)
Example #38
0
    def scrape_uppper_committee_vote(self, bill, date, url):
        """Parse a committee vote PDF and add the vote to ``bill``.

        The motion follows "FINAL ACTION:" on the sixth line of the
        converted text.  The row holding the repeated "Yea  Nay" headings
        fixes the column positions; each following non-blank row names a
        member, and the indentation of the row's vote code decides
        whether it counts as a yea or a nay.  Rows without a counted vote
        code go to the "other" list.  Yes/no totals come from the numbers
        that precede "TOTALS" in the text.

        :param bill: bill object that receives the vote via ``add_vote``.
        :param date: date of the vote.
        :param url: address of the PDF; also recorded as the vote source.
        :raises AssertionError: if a vote code sits between the yea and
            nay columns.
        """
        (pdf_path, _resp) = self.urlretrieve(url)
        text = convert_pdf(pdf_path, 'text')
        lines = text.split("\n")
        os.remove(pdf_path)

        (_, motion) = lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.warning("Vote appears to be empty")
            return

        # Locate the first heading row and derive the column boundaries
        header_index = [
            position for (position, row) in enumerate(lines)
            if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', row)
        ][0]
        header_row = lines[header_index]
        yea_columns_end = header_row.index("Yea") + len("Yea")
        nay_columns_begin = header_row.index("Nay")

        member_pattern = r'''(?x)
                        ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                        ([A-Z][a-z]+  # Name must have lower-case characters
                        [\w\-\s]+)  # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?  # Leadership has an all-caps title
                        (?:\s{2,}.*)?  # Name ends when many spaces are seen
                        '''

        yes_names = []
        no_names = []
        other_names = []
        for row in lines[(header_index + 1):]:
            # The member table ends at the first blank row
            if not row.strip():
                break

            member = re.search(member_pattern, row).group(1)

            # Usually non-voting members won't even have a code listed.
            # Only a couple of codes indicate an actual vote:
            # "X", "VA" (vote after roll call) and "VC" (vote change)
            if re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', row) is None:
                other_names.append(member)
                continue

            # The indentation of the code tells which column it is in
            code_column = len(row) - len(row.lstrip())
            if code_column <= yea_columns_end:
                yes_names.append(member)
            elif code_column >= nay_columns_begin:
                no_names.append(member)
            else:
                raise AssertionError(
                    "Unparseable vote found for {0} in {1}:\n{2}".
                    format(member, url, row))

        totals_match = re.search(
            r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS', text)
        totals = totals_match.groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        other_count = len(other_names)
        passed = (yes_count > no_count)

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)
        vote['yes_votes'] = yes_names
        vote['no_votes'] = no_names
        vote['other_votes'] = other_names

        vote.validate()
        bill.add_vote(vote)
Example #39
0
    def scrape_journal(self, session, url):
        """Scan a journal PDF line by line and save every vote found in it.

        The PDF at ``url`` is converted to text and walked with the state
        machine described below.  A vote is created when a quoted question
        is complete, filled in from the count and name lines that follow,
        and saved through ``self.save_vote`` once a line announces the
        outcome.

        :param session: stored on every vote that is created.
        :param url: location of the journal PDF; also each vote's source.
        """
        journal, resp = self.urlretrieve(url)
        text = convert_pdf(journal, type='text')
        lines = text.splitlines()

        #  state machine:
        #      None - undefined state
        #      question_quote - in question, looking for end quote
        #      pre-yes - vote is active, haven't hit yes votes yet
        #      yes     - yes votes
        #      no      - no votes
        #      other   - other votes
        state = None
        vote = None

        # NOTE(review): `date`, `bill_id` and `question` are only bound once
        # a matching line has been seen; the code below relies on the
        # journal presenting them before they are needed.
        for line_num, line in enumerate(lines):
            date_match = DATE_RE.findall(line)

            # skip headers
            if 'LEGISLATIVE JOURNAL' in line:
                continue

            # a date line updates the current date and is otherwise ignored
            elif date_match:
                date = datetime.datetime.strptime(' '.join(date_match[0]),
                                                  '%B %d %Y')
                continue

            # keep adding lines to question while quotes are open
            elif state == 'question_quote':
                question += ' %s' % line

            # a vote is active: the line is a count heading, an outcome
            # line, or a row of member names
            elif state in ('pre-yes', 'yes', 'no', 'other'):
                yes_match = YES_RE.match(line)
                no_match = NO_RE.match(line)
                other_match = NOT_VOTING_RE.match(line)
                if yes_match:
                    vote['yes_count'] = int(yes_match.group(1))
                    state = 'yes'
                elif no_match:
                    vote['no_count'] = int(no_match.group(1))
                    state = 'no'
                elif other_match:
                    # accumulated rather than assigned, so several matching
                    # headings add up into one other_count
                    vote['other_count'] += int(other_match.group(1))
                    state = 'other'
                elif 'having voted in the affirmative' in line:
                    # outcome line: the vote passed; validate and save it
                    vote['passed'] = True
                    state = None
                    vote.validate()
                    self.save_vote(vote)
                    vote = None
                elif 'Having failed' in line:
                    # outcome line: the vote failed; validate and save it
                    vote['passed'] = False
                    state = None
                    vote.validate()
                    self.save_vote(vote)
                    vote = None
                elif line:
                    # names on one row are separated by three or more
                    # whitespace characters
                    people = re.split('\s{3,}', line)
                    # NOTE(review): the table has no 'pre-yes' entry, so a
                    # non-empty, unrecognised line seen before the first
                    # yes heading raises KeyError here (the handling for
                    # that case is commented out).
                    #try:
                    func = {
                        'yes': vote.yes,
                        'no': vote.no,
                        'other': vote.other
                    }[state]
                    #except KeyError:
                    #self.warning('line showed up in pre-yes state: %s',
                    #             line)
                    for p in people:
                        if p:
                            # special case for long name w/ 1 space
                            if p.startswith(('Lautenbaugh ', 'Langemeier ')):
                                p1, p2 = p.split(' ', 1)
                                func(p1)
                                func(p2)
                            else:
                                func(p)

            # check the text against our regexes
            # (this also runs for lines handled by the branches above,
            # except those that hit `continue`)
            bill_match = BILL_RE.match(line)
            veto_match = VETO_BILL_RE.findall(line)
            question_match = QUESTION_RE.findall(line)
            if bill_match:
                bill_type, bill_id = bill_match.groups()
                if bill_type == 'BILL':
                    bill_id = 'LB ' + bill_id
                elif bill_type == 'RESOLUTION':
                    bill_id = 'LR ' + bill_id
            elif question_match:
                question = question_match[0]
                state = 'question_quote'
            elif veto_match:
                bill_id = veto_match[0]

            # line just finished a question
            if state == 'question_quote' and QUESTION_MATCH_END in question:
                question = re.sub(
                    '\s+', ' ',
                    question.replace(QUESTION_MATCH_END, '').strip())
                # start a new vote with placeholder counts and outcome;
                # the real values are filled in from the lines that follow
                vote = Vote(bill_id=bill_id,
                            session=session,
                            bill_chamber='upper',
                            chamber='upper',
                            motion=question,
                            type='passage',
                            passed=False,
                            date=date,
                            yes_count=0,
                            no_count=0,
                            other_count=0)
                vote.add_source(url)
                state = 'pre-yes'
                # reset bill_id and question
                bill_id = question = None
Example #40
0
    def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
        """Download a roll call document and add the vote to ``bill``.

        The document is converted to ``/tmp/ksvote.txt`` with abiword
        (limited to 10 seconds by ``timeout``).  A line holding the four
        totals starts the vote; the "Yeas:", "Nays:", "Present ..." and
        "Absent or ..." lines after it supply member names.  The vote is
        treated as passed unless a line contains "the motion did not
        prevail".  Nothing is added if no totals line is found or the
        conversion fails.

        :param bill: bill object that receives the vote via ``add_vote``.
        :param vote_date: date string in one of the formats listed below.
        :param vote_chamber: ``'Senate'`` maps to ``'upper'``; anything
            else maps to ``'lower'``.
        :param vote_status: used (stripped) as the motion text.
        :param vote_url: address of the document; also the vote source.
        :raises ValueError: if ``vote_date`` matches none of the formats.
        """
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        formats = ['%a %d %b %Y',
                   '%b. %d, %Y, %H:%M %p',
                   '%B %d, %Y, %H:%M %p',
                   '%B %d, %Y, %H %p',
                   '%a, %b %d, %Y'
                  ]
        # "a.m."/"p.m." -> "am"/"pm" so that %p can match
        vote_date = vote_date.replace('.m.', 'm')
        for date_format in formats:
            try:
                vote_date = datetime.datetime.strptime(vote_date, date_format)
                break
            except ValueError:
                pass
        else:
            raise ValueError("couldn't parse date: " + vote_date)

        vote_doc, resp = self.urlretrieve(vote_url)

        try:
            # Argument list instead of a shell string: the downloaded
            # path reaches abiword as one argument whatever it contains.
            subprocess.check_call(
                ['timeout', '10', 'abiword', '--to=ksvote.txt', vote_doc],
                cwd='/tmp/')
        except subprocess.CalledProcessError:
            # timeout failed, some documents hang abiword
            self.error('abiword hung for longer than 10s on conversion')
            return
        finally:
            # The download is no longer needed once conversion has been
            # attempted, whether or not it succeeded
            os.remove(vote_doc)

        with open('/tmp/ksvote.txt') as vote_file:
            vote_lines = vote_file.readlines()

        comma_or_and = re.compile(r', |\sand\s')
        comma_or_and_jrsr = re.compile(r', (?!Sr.|Jr.)|\sand\s')
        totals_re = re.compile(
            r'Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting|Present and Passing):? (\d+)[;,] (?:Absent or not voting|Absent or Not Voting):? (\d+)')

        vote = None
        passed = True
        for line in vote_lines:
            totals = totals_re.findall(line)
            line = line.strip()
            if totals:
                totals = totals[0]
                yeas = int(totals[0])
                nays = int(totals[1])
                nv = int(totals[2])
                absent = int(totals[3])
                # default passed to true
                vote = Vote(vote_chamber, vote_date, vote_status.strip(),
                            True, yeas, nays, nv+absent)
            elif vote and line.startswith('Yeas:'):
                line = line.split(':', 1)[1].strip()
                for member in comma_or_and.split(line):
                    if member != 'None.':
                        vote.yes(member)
            elif vote and line.startswith('Nays:'):
                line = line.split(':', 1)[1].strip()
                # slightly different vote format if Jr stands alone on a line
                if ', Jr.,' in line:
                    regex = comma_or_and_jrsr
                else:
                    regex = comma_or_and
                for member in regex.split(line):
                    if member != 'None.':
                        vote.no(member)
            elif vote and line.startswith('Present '):
                line = line.split(':', 1)[1].strip()
                for member in comma_or_and.split(line):
                    if member != 'None.':
                        vote.other(member)
            elif vote and line.startswith('Absent or'):
                line = line.split(':', 1)[1].strip()
                for member in comma_or_and.split(line):
                    if member != 'None.':
                        vote.other(member)
            elif 'the motion did not prevail' in line:
                passed = False

        if vote:
            vote['passed'] = passed
            vote.add_source(vote_url)
            bill.add_vote(vote)
Example #41
0
    def scrape_floor_vote(self, chamber, bill, date, url):
        """Parse a floor roll call PDF and add the vote to ``bill``.

        The converted text is read by line position: the motion is
        expected at index 4, the totals at index 6 and the member rows
        from index 9.  These positions move up by one when the motion is
        found a line early, and down by one for each of up to two extra
        motion lines.  Member rows stop at the first blank line.

        :param chamber: passed through as the first argument of ``Vote``.
        :param bill: bill object that receives the vote via ``add_vote``.
        :param date: date of the vote.
        :param url: address of the PDF; also recorded as the vote source.
        """
        (pdf_path, _resp) = self.urlretrieve(url)
        text = convert_pdf(pdf_path, 'text')
        lines = text.split("\n")
        os.remove(pdf_path)

        motion_at = 4
        totals_at = 6
        members_at = 9

        motion = lines[motion_at].strip()
        if motion or lines[motion_at - 1].startswith("Calendar Page:"):
            assert motion, "Floor vote's motion name appears to be empty"
        else:
            # Sometimes there is no motion name, only "Passage" in the
            # line above; everything then sits one line higher
            motion = lines[motion_at - 1]
            motion_at -= 1
            totals_at -= 1
            members_at -= 1

        # The motion may run on for up to two more lines
        for _unused in range(2):
            motion_at += 1
            continuation = lines[motion_at].strip()
            if not continuation:
                break
            motion = "{}, {}".format(motion, continuation)
            totals_at += 1
            members_at += 1

        totals_match = re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[totals_at])
        (yes_count, no_count, other_count) = [
            int(number) for number in totals_match.groups()
        ]
        passed = (yes_count > no_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        recorders = (
            (r'\s*Y\s+(.*?)-\d{1,3}\s*', vote.yes),
            (r'\s*N\s+(.*?)-\d{1,3}\s*', vote.no),
            (r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', vote.other),
        )

        for row in lines[members_at:]:
            if not row.strip():
                break

            # Drop a presiding officer's title (at most one kind per row)
            for title in (" President ", " Speaker "):
                if title in row:
                    row = row.replace(title, " ")
                    break

            for (pattern, record) in recorders:
                for member in re.findall(pattern, row):
                    record(member)

        try:
            vote.validate()
        except ValueError:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.logger.info("Votes don't add up; looking for additional ones")
            for row in lines[members_at:]:
                if not row.strip():
                    break
                uncoded = re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}', row)
                for member in uncoded:
                    vote.other(member)

        vote.validate()
        bill.add_vote(vote)
Example #42
0
    def scrape_lower_committee_votes(self, session_number, bill):
        '''
        Fetch the House committee roll calls for ``bill`` from the House
        website and add each one to the bill.

        The bill is looked up by posting its number to the House bill
        search form, the single bill-detail link in the response is
        followed, and every "See Votes" link on that page is parsed into
        one vote.

        :param session_number: value sent as the form's session field.
        :param bill: bill whose ``bill_id`` is searched for and which
            receives the votes via ``add_vote``.
        :raises IndexError: if a member's vote marker is not recognised.
        '''

        house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

        # Keep the digits and all following characters in the bill's ID
        id_match = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id'])
        bill_number = id_match.group(1)

        form = {
            'rblChamber': 'B',
            'ddlSession': session_number,
            'ddlBillList': '-1',
            'txtBillNumber': bill_number,
            'ddlSponsor': '-1',
            'ddlReferredTo': '-1',
            'SubmittedByControl': '',
        }
        search_response = self.post(url=house_url, data=form)
        doc = lxml.html.fromstring(search_response.text)
        doc.make_links_absolute(house_url)

        # Exactly one detail link is expected; unpacking fails otherwise
        [bill_link] = doc.xpath(
            '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
        bill_doc = self.lxmlize(bill_link)

        for link in bill_doc.xpath('//a[text()="See Votes"]/@href'):
            vote_doc = self.lxmlize(link)

            [date_text] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
            date = datetime.datetime.strptime(
                date_text, '%m/%d/%Y %I:%M:%S %p').date()

            # The last nested table carries the totals; collapse its
            # whitespace before matching
            totals = vote_doc.xpath('//table//table')[-1].text_content()
            totals = re.sub(r'(?mu)\s+', " ", totals).strip()
            totals_match = re.search(
                r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                'Total Missed:\s+(\d+)', totals)
            (yes_count, no_count, other_count) = [
                int(number) for number in totals_match.groups()
            ]
            passed = yes_count > no_count

            [committee] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
            [action] = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
            motion = "{} ({})".format(action, committee)

            vote = Vote('lower', date, motion, passed, yes_count, no_count,
                        other_count)
            vote.add_source(link)

            # Vote marker -> method that records the member
            recorders = {
                "Y": vote.yes,
                "N": vote.no,
                "-": vote.other,
            }

            for cell in vote_doc.xpath('//table//table//table//td'):
                if not cell.text_content().strip():
                    continue

                [member] = cell.xpath('span[2]//text()')
                [marker] = cell.xpath('span[1]//text()')

                if marker in recorders:
                    recorders[marker](member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif re.search(r'\([YN]\)', marker):
                    continue
                else:
                    raise IndexError(
                        "Unknown vote type found: {}".format(marker))

            vote.validate()
            bill.add_vote(vote)
Example #43
0
    def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        """Parse the HTML roll-call page at ``url`` and add the vote to ``bill``.

        Nothing is added when the page cannot be fetched, when the page
        description mentions a voice vote, or when the description is only
        the "UTAH STATE LEGISLATURE" banner.  Pages whose description
        mentions a committee are delegated to ``scrape_committee_vote`` and
        its result is returned.

        Raises ``NotImplementedError`` when the description contains neither
        "Passed" nor "Failed".
        """
        try:
            raw_html = self.get(url).text
        except scrapelib.HTTPError:
            self.warning("A vote page not found for bill {}".format(
                bill['bill_id']))
            return

        page = lxml.html.fromstring(raw_html)
        page.make_links_absolute(url)

        # The first <b> holds the description; the rest head the vote tables.
        bold_nodes = page.xpath("//b")
        descr = bold_nodes[0].text_content()
        if descr == '':
            # Newer page layout keeps the description in a <center> element.
            descr = page.xpath("//div[@id='content']/center")[0].text

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            return self.scrape_committee_vote(bill, actor, date, motion, page,
                                              url, uniqid)

        if "Passed" in descr:
            passed = True
        elif "Failed" in descr:
            passed = False
        elif "UTAH STATE LEGISLATURE" in descr:
            return
        else:
            self.warning(descr)
            raise NotImplementedError("Can't see if we passed or failed")

        # Pair each "<label> - <count>" heading with the table that follows it.
        tallies = {}
        for heading, table in zip(bold_nodes[1:], page.xpath("//table")):
            pieces = [part.strip()
                      for part in heading.text_content().split("-", 1)]
            if len(pieces) != 2:
                continue
            label, total = pieces
            total = int(total)
            names = [
                cell.text_content().strip()
                for cell in table.xpath(".//font[@face='Arial']")
            ]
            tallies[label] = {"count": total, "people": names}

        vote = Vote(actor,
                    date,
                    motion,
                    passed,
                    tallies['Yeas']['count'],
                    tallies['Nays']['count'],
                    tallies['Absent or not voting']['count'],
                    _vote_id=uniqid)
        vote.add_source(url)

        recorders = (
            ('Yeas', vote.yes),
            ('Nays', vote.no),
            ('Absent or not voting', vote.other),
        )
        for label, record in recorders:
            for person in tallies[label]['people']:
                record(person)

        self.info("Adding vote to bill")
        bill.add_vote(vote)
Example #44
0
    def scrape_digest(self, bill):
        """Scrape the digest page of ``bill`` for title, sponsors, actions, votes.

        The digest URL is built from ``bill['session']`` and
        ``bill['bill_id']`` and added as a source.  If the page cannot be
        fetched a warning is logged and nothing else happens.

        Every ``<p>`` from the first dated action onwards is walked in order:
        dated lines become actions on ``bill``; a ``ROLL CALL`` line starts a
        small state machine that collects voter lists until the totals line,
        then adds a ``Vote`` dated and named after the most recent action.
        """
        digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.htm' % bill

        bill.add_source(digest_url)

        try:
            html = self.urlopen(digest_url)
        except scrapelib.HTTPError:
            self.warning('no digest for %s' % bill['bill_id'])
            return

        doc = lxml.html.fromstring(html)

        ext_title = doc.xpath('//span[@class="billtitle"]')
        if ext_title:
            bill['extended_title'] = ext_title[0].text_content().replace(
                '\r\n', ' ')

        sponsor_span = doc.xpath('//span[@class="sponsors"]')
        sponsors = ''
        if sponsor_span:
            sponsors = sponsor_span[0].text_content().replace('\r\n', ' ')
        else:
            # Fall back to the last paragraph that starts "Sponsored by".
            for p in doc.xpath('//p'):
                if p.text_content().lower().startswith('sponsored by'):
                    sponsors = p.text_content().replace('\r\n', ' ')
        if sponsors:
            if 'Committee' in sponsors:
                bill.add_sponsor('sponsor', sponsors)
            else:
                if bill['chamber'] == 'lower':
                    sp_lists = sponsors.split('and Senator(s)')
                else:
                    sp_lists = sponsors.split('and Representative(s)')
                for spl in sp_lists:
                    for sponsor in split_names(spl):
                        bill.add_sponsor('sponsor', sponsor)

        action_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
        vote_total_re = re.compile(
            r'(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)'
            r'Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)')

        # Section markers that may prefix a voter list on the same line.
        breakers = ("Ayes:", "Nays:", "Nayes:", "Excused:", "Absent:",
                    "Conflicts:")
        # Section markers that appear alone, announcing the next line.
        section_names = ('Ayes', 'Nays', 'Excused', 'Absent', 'Conflicts')

        # Keep every paragraph from the first dated action onwards.
        actions = []
        past_header = False
        for text in (p.text_content() for p in doc.xpath('//p')):
            if not past_header and action_re.match(text):
                past_header = True
            if past_header:
                actions.append(text)

        # initial actor is bill chamber
        actor = bill['chamber']

        aiter = iter(actions)
        for line in aiter:
            line = clean_line(line)

            # skip blank lines
            if not line:
                continue

            amatch = action_re.match(line)
            if amatch:
                date, achamber, action = amatch.groups()

                # change actor if one is on this action
                if achamber == 'H ':
                    actor = 'lower'
                elif achamber == 'S ':
                    actor = 'upper'

                date = datetime.datetime.strptime(date, '%m/%d/%Y')
                bill.add_action(actor,
                                action,
                                date,
                                type=categorize_action(action))
            elif line == 'ROLL CALL':
                # Consume lines from the shared iterator until the totals
                # line.  Three kinds of line matter:
                #   Ayes|Nays|Excused|...           - next line is voters
                #   : (Senators|Representatives)... - the voters themselves
                #   \d+ Nays \d+ Excused ...        - totals, ends the vote
                voters = {}
                # Reset per roll call so a stale section name from an
                # earlier vote can never label this vote's voters.
                voters_type = None
                while True:
                    try:
                        nextline = clean_line(next(aiter))
                    except StopIteration:
                        # Page ended before the totals line; the outer loop
                        # finishes as well because the iterator is spent.
                        self.warning('roll call without vote totals in '
                                     'digest for %s' % bill['bill_id'])
                        break
                    if not nextline:
                        continue

                    for breaker in breakers:
                        if nextline.startswith(breaker):
                            voters_type = breaker[:-1]
                            if voters_type == "Nayes":
                                voters_type = "Nays"
                                self.log("Fixed a case of 'Naye-itis'")
                            # Keep the ':' so the line reads as a voter list.
                            nextline = nextline[len(breaker) - 1:]
                            break

                    if nextline.startswith(': '):
                        if voters_type is not None:
                            voters[voters_type] = nextline
                        continue
                    if nextline in section_names:
                        voters_type = nextline
                        continue

                    totals = vote_total_re.match(nextline)
                    if not totals:
                        continue

                    counts = totals.groups()
                    ayes = counts[1]
                    nays = counts[4]
                    excused = counts[7]
                    absent = counts[10]
                    conflicts = counts[13]

                    passed = (('Passed' in action or 'Do Pass' in action
                               or 'Did Concur' in action)
                              and 'Failed' not in action)
                    vote = Vote(actor, date, action, passed, int(ayes),
                                int(nays),
                                int(excused) + int(absent) + int(conflicts))

                    for vtype, names in voters.items():
                        for voter in split_names(names):
                            if vtype == 'Ayes':
                                vote.yes(voter)
                            elif vtype == 'Nays':
                                vote.no(voter)
                            else:
                                vote.other(voter)
                    # done collecting this vote
                    bill.add_vote(vote)
                    break
Example #45
0
    def scrape_journal(self, url, chamber, session, date):
        """Download a journal PDF and save a vote for each question in it.

        The PDF at ``url`` is converted with ``convert_pdf`` and parsed as
        XML; an unparsable document is logged and skipped.  Each line that
        contains "Shall" together with "bill pass?", "resolution" or
        "amendment" starts a vote: following lines are appended until a
        parenthesised bill id is found, the motion text is cleaned up, and
        ``parse_votes`` reads the tallies from the same line iterator.  The
        resulting ``Vote`` is handed to ``save_vote``.
        """
        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r' % filename)
        xml = convert_pdf(filename)
        try:
            et = lxml.etree.fromstring(xml)
        except lxml.etree.XMLSyntaxError:
            self.logger.warning('Skipping invalid pdf: %r' % filename)
            return

        bill_id_re = re.compile(r'\(\s*([A-Z\.]+\s+\d+)\s*\)')
        question_markers = ('bill pass?', 'resolution', 'amendment')

        # ``lines`` is one shared iterator: the loops below and
        # ``parse_votes`` all advance it.
        lines = self._journal_lines(et)
        for line in lines:
            text = gettext(line)

            # Only "Shall ..." questions about a bill, resolution or
            # amendment are votes.
            if 'Shall' not in text:
                continue
            if not any(marker in text for marker in question_markers):
                continue

            # Get the bill_id; the question may wrap over several lines.
            bill_id = None
            for line in lines:
                text += gettext(line)
                id_match = bill_id_re.search(text)
                if id_match:
                    bill_id = id_match.group(1)
                    break
            if bill_id is None:
                # The journal ended in the middle of a question.
                self.logger.warning(
                    'Journal ended before a bill id was found: %r' % url)
                break

            motion = text.strip()
            motion = re.sub(r'\s+', ' ', motion)
            # Drop the trailing "(bill id)".  Split once from the right so
            # parentheses earlier in the motion do not break the unpacking.
            motion = motion.rsplit('(', 1)[0]
            motion = motion.replace('"', '')
            motion = motion.replace(u'“', '')
            motion = motion.replace(u'\u201d', '')
            motion = motion.replace(u' ,', ',')
            motion = motion.strip()
            motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
            motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

            for word, letter in (('Senate', 'S'), ('House', 'H'), ('File',
                                                                   'F')):
                bill_id = bill_id.replace(word, letter)

            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            self.current_id = bill_id
            votes = self.parse_votes(lines)

            # Sum every integer value in the tallies.
            total_cast = sum(
                value for value in votes.values() if isinstance(value, int))
            if total_cast:
                passed = (1.0 * votes['yes_count'] / total_cast) >= 0.5
            else:
                # No counts parsed at all: avoid dividing by zero.
                self.logger.warning(
                    'No vote counts found for %s in %r' % (bill_id, url))
                passed = False

            vote = Vote(motion=motion,
                        passed=passed,
                        chamber=chamber,
                        date=date,
                        session=session,
                        bill_id=bill_id,
                        bill_chamber=bill_chamber,
                        **votes)
            vote.update(votes)
            vote.add_source(url)
            self.save_vote(vote)
Example #46
0
    def scrape_vote(self, bill, name, url):
        """Parse a roll-call PDF and add the resulting vote to ``bill``.

        ``name`` must look like "<Senate|House> Vote on <something>,<motion>";
        any other name is ignored and the method returns without fetching
        ``url``.  The vote date is read as four digits (MMDD) following the
        bill id in the PDF text and combined with ``bill['session']`` as the
        year.  Names listed under YEAS / NAYS / ABSENT are recorded, and the
        "Total--N" lines provide the counts.
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion or 'ON 3RD READINT' in motion:
            # The misspelt variant was the only one matched historically;
            # it is kept so no previously recognised motion changes type.
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None,
                    None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        with self.urlopen(url) as text:
            (fd, temp_path) = tempfile.mkstemp()
            try:
                with os.fdopen(fd, 'wb') as w:
                    w.write(text)
                html = pdf_to_lxml(temp_path)
            finally:
                # Remove the temp file even when conversion fails.
                os.remove(temp_path)

            vote_type = None
            total_re = re.compile(r'^Total--(\d+)$')
            body = html.xpath('string(/html/body)')

            # Four digits after the bill id are read as MMDD.
            date_match = re.search(r'%s (\d{4,4})' % bill['bill_id'], body)
            date = date_match.group(1)
            month = int(date[0:2])
            day = int(date[2:4])
            date = datetime.date(int(bill['session']), month, day)
            vote['date'] = date

            section_types = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}

            for line in body.replace(u'\xa0', '\n').split('\n'):
                line = line.replace('&nbsp;', '').strip()
                if not line:
                    continue

                if line in section_types:
                    vote_type = section_types[line]
                elif vote_type:
                    total_match = total_re.match(line)
                    if total_match:
                        vote['%s_count' % vote_type] = int(
                            total_match.group(1))
                    elif vote_type == 'yes':
                        vote.yes(line)
                    elif vote_type == 'no':
                        vote.no(line)
                    elif vote_type == 'other':
                        vote.other(line)

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote['passed'] = (
            vote['yes_count'] > (vote['no_count'] + vote['other_count']))

        bill.add_vote(vote)
Example #47
0
    def scrape_votes(self, session):
        """Load roll-call summaries and member votes from the zipped tables.

        ``tblrollcallsummary.txt`` is read first: each pipe-delimited row
        that belongs to ``session`` and to a bill in ``self.bills_by_id``
        becomes a ``Vote`` attached to that bill.  ``tblrollcallhistory.txt``
        then supplies one row per member, which is recorded on the matching
        vote (looked up by chamber letter + roll-call number).

        A summary row with fewer than 14 fields is treated as the first half
        of a row broken by an embedded newline.  It is held back and, if the
        next short row completes it to exactly 14 fields, the two are
        stitched together and processed once.
        """
        votes = {}
        # Fields of an incomplete summary row awaiting its continuation.
        pending = []

        for raw in self.zf.open('tblrollcallsummary.txt'):
            if raw.strip() == "":
                continue

            line = raw.split('|')
            if len(line) < 14:
                if pending and len(pending) + len(line) - 1 == 14:
                    # The last pending field and the first field of this
                    # fragment are two halves of the same value.
                    joined = pending[-1].rstrip('\r\n') + ' ' + line[0]
                    line = pending[:-1] + [joined] + line[1:]
                    pending = []
                    self.warning('used bad vote line')
                else:
                    pending = line
                    self.warning('bad vote line %s' % '|'.join(line))
                    # Not enough fields to read yet; wait for the rest.
                    continue
            else:
                pending = []

            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            motion = line[11].strip() or '[not available]'

            if session_yr == session and bill_id in self.bills_by_id:
                actor = 'lower' if body == 'H' else 'upper'
                when = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(actor,
                            when,
                            motion,
                            passed,
                            yeas,
                            nays,
                            other_count=0)
                votes[body + vote_num] = vote
                self.bills_by_id[bill_id].add_vote(vote)

        for line in self.zf.open('tblrollcallhistory.txt'):
            # 2012    | H   | 2    | 330795  | HB309  | Yea |1/4/2012 8:27:03 PM
            session_yr, body, v_num, employee, bill_id, choice, date \
                    = line.split('|')

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = self.legislators[employee]['name']
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                choice = choice.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue

                if choice == 'Yea':
                    votes[vote_key].yes(leg)
                elif choice == 'Nay':
                    votes[vote_key].no(leg)
                else:
                    # The summary was created with other_count=0, so the
                    # count is built up here one member at a time.
                    votes[vote_key].other(leg)
                    votes[vote_key]['other_count'] += 1
Example #48
0
    def scrape_votes(self, url, motion, date, chamber):
        """Build a ``Vote`` from the roll-call PDF at ``url``.

        ``motion`` is looked up in ``self._vote_mapping`` to obtain the
        motion text and the passed flag; an unknown motion raises
        ``KeyError``.  The PDF text is scanned for section markers such as
        "Yeas--" and "Nays--", and the comma-separated names that follow each
        marker are collected until a "None." or "Total--" entry closes the
        section.  Counts are taken from the number of names collected.

        Returns the populated ``Vote``.
        """
        vote_pdf, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(vote_pdf, 'text')
        finally:
            # Remove the downloaded file even when conversion fails.
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []

        # point at array to add names to
        cur_array = None

        precursors = (
            ('Yeas--', yes_votes),
            ('Nays--', no_votes),
            ('Absent or those not voting--', other_votes),
            ('Absent and those not voting--', other_votes),
            ('Voting Present--', other_votes),
            ('Present--', other_votes),
            ('DISCLAIMER', None),
        )

        # Fragments showing that a "name" is really boilerplate text.
        junk_markers = ('on final passage', 'Necessary', 'who would have',
                        'being a tie', 'therefore', 'Vacancies', 'a pair',
                        'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD', 'TREASURER',
                        'GOVERNOR', 'ARCHIVES', 'SECRETARY')
        total_re = re.compile(r'(.+?)\. Total--.*')

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.split('\n'))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line:
                    cur_array = arr
                    line = line.replace(pc, '')

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None
                total_match = total_re.match(name)
                if total_match:
                    # A total can follow "None." or sit outside any section,
                    # in which case there is no list to append to.
                    if cur_array is not None:
                        cur_array.append(total_match.group(1))
                    cur_array = None

                # append name if it looks ok
                if cur_array is None:
                    continue
                if any(junk in name for junk in junk_markers):
                    continue
                # strip trailing .
                if name.endswith('.'):
                    name = name[:-1]
                cur_array.append(name)

        # return vote object
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        other_count = len(other_votes)
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote['yes_votes'] = yes_votes
        vote['no_votes'] = no_votes
        vote['other_votes'] = other_votes
        return vote
Example #49
0
    def scrape_senate_vote(self, bill, url):
        """Parse a Senate roll-call PDF and add the vote to ``bill``.

        The date and time are read from "Date:" / "Time:" labels in the PDF
        text and localized with ``self._tz``.  Tallies and voter names are
        read from line 22 of the text onwards: YEAS / NAYS / EXCUSED /
        NOT VOTING lines carry a " - <count>" suffix and switch the current
        section; every other line is split on runs of two or more spaces
        into names.  The motion is the line two below the last line equal to
        the cleaned bill id.

        When the date, time, counts or motion cannot be found a message is
        logged and no vote is added.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Remove the downloaded file even when conversion fails.
            os.remove(path)

        lines = text.split('\n')

        date_match = re.search(r'Date:\s+(\d+/\d+/\d+)', text)
        if not date_match:
            self.log("Couldn't find date on %s" % url)
            return

        time_match = re.search(r'Time:\s+(\d+:\d+:\d+)\s+(AM|PM)', text)
        if not time_match:
            self.log("Couldn't find time on %s" % url)
            return

        date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                             time_match.group(2))
        date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
        date = self._tz.localize(date)

        vote_type = None
        yes_count, no_count, other_count = None, None, 0
        votes = []
        for line in lines[21:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith('YEAS'):
                yes_count = int(line.split(' - ')[1])
                vote_type = 'yes'
            elif line.startswith('NAYS'):
                no_count = int(line.split(' - ')[1])
                vote_type = 'no'
            elif line.startswith(('EXCUSED', 'NOT VOTING')):
                other_count += int(line.split(' - ')[1])
                vote_type = 'other'
            else:
                # Names are laid out in columns separated by 2+ spaces.
                votes.extend([(n.strip(), vote_type)
                              for n in re.split(r'\s{2,}', line)])

        if yes_count is None or no_count is None:
            self.log("Couldne't find vote counts in %s" % url)
            return

        passed = yes_count > no_count + other_count

        clean_bill_id = fix_bill_id(bill['bill_id'])
        motion_line = None
        for i, line in enumerate(lines):
            if line.strip() == clean_bill_id:
                motion_line = i + 2

        # The bill id may be absent, or too close to the end of the text.
        motion = None
        if motion_line is not None and motion_line < len(lines):
            motion = lines[motion_line]
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        insert_specific_votes(vote, votes)
        check_vote_counts(vote)

        bill.add_vote(vote)
Example #50
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of data)

        ``chamber`` must be "Senate" or "House"; anything else raises
        ``KeyError``.  For every table row with a bill id and a title this
        builds a ``Bill``, attaches its versions, history actions, sponsors
        and votes, and saves it with ``save_bill``.

        Raises ``AssertionError`` when the id on the votes page differs from
        the id on the sheet.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions within a sheet row.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # Bill-id prefix -> bill type.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        chamber_map = dict(Senate='upper', House='lower')
        action_actor = chamber_map.get(chamber, chamber)

        sheet_html = self.urlopen(sheet_url)
        sheet_page = lxml.html.fromstring(sheet_html)

        bills = sheet_page.xpath('//table/tr')

        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id is None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]
            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            if bill_title is None:
                continue  # Odd ...

            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            bill_type = None
            for cat in cats:
                if bill_id.startswith(cat):
                    bill_type = cats[cat]

            b = Bill(session,
                     bill_chamber,
                     bill_id,
                     bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            versions_url = CO_URL_BASE + versions_url
            versions = self.parse_versions(versions_url)

            for version in versions:
                b.add_version(version['name'],
                              version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']
            # ^^^^^^^ We assume this is a full path to the target.
            # might want to consider some better rel-path support
            # XXX: Look at this ^

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action, date in history:
                attrs = dict(actor=action_actor, action=action, date=date)
                attrs.update(self.categorizer.categorize(action))
                b.add_action(**attrs)

            for sponsor in sponsors:
                if sponsor and sponsor != "(NONE)":
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes

            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)

            if votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError(
                    "vote page is for %s, expected %s" %
                    (votes['sanity-check'], bill_id))

            for vote_record in votes['votes']:
                filed_votes = vote_record['votes']
                passage = vote_record['meta']
                result = vote_record['result']

                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")
                has_house = "House" in passage['x-parent-ctty']
                has_senate = "Senate" in passage['x-parent-ctty']

                if has_house and has_senate:
                    actor = "joint"
                elif has_house:
                    actor = "lower"
                else:
                    actor = "upper"

                # Normalise each choice once so the tally check below and
                # the per-voter filing further down cannot disagree.
                choices = dict(
                    (who, filed_votes[who].lower().strip())
                    for who in filed_votes)

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.
                local_other = sum(
                    1 for choice in choices.values()
                    if choice not in ("yes", "no"))

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" %
                                 (other, local_other))
                    other = local_other

                passed = (result['FINAL_ACTION'] == "PASS")
                if not passage['MOTION'].strip():
                    continue

                if "without objection" in passage['MOTION'].lower():
                    passed = True

                v = Vote(actor,
                         pydate,
                         passage['MOTION'],
                         passed,
                         int(result['YES']),
                         int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(vote_record['meta']['url'])
                # v.add_source( bill_vote_href )

                # XXX: Add more stuff to kwargs, we have a ton of data
                seen = set()
                for who in filed_votes:
                    if who in seen:
                        raise Exception("Seeing the double-thing. - bug #702")
                    seen.add(who)

                    choice = choices[who]
                    if choice == "yes":
                        v.yes(who)
                    elif choice == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
Example #51
0
    def scrape(self, session, chambers):
        HTML_TAGS_RE = r'<.*?>'

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            # Create the bill using its basic information
            bill = Bill(session=session,
                        bill_id=info['BillNumber'],
                        title=info['Title'],
                        chamber=bill_chamber,
                        type=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[ :5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a')
            for version in versions:
                if version.xpath('text()'):
                    bill.add_version(name=version.xpath('text()')[0],
                                     url=version.xpath('@href')[0].replace(
                                         ' ', '%20'),
                                     mimetype='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                        year_slug), lxml.etree.tostring(doc)).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".\
                        format(info['BillNumber']))
                self.save_bill(bill)
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    assert chambers_passed == set("HS")
                    action_type = 'governor:signed'
                elif actor == 'lower' and \
                        any(x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("H")
                elif actor == 'upper' and \
                        any(x.lower().startswith(' aspassed') or x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("S")
                else:
                    action_type = 'other'

                bill.add_action(actor=actor,
                                action=re.sub(HTML_TAGS_RE, "",
                                              action['FullStatus']),
                                date=datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                type=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = Vote(chamber=('lower' if vote['ChamberCode']
                                            == 'H' else 'upper'),
                                   date=datetime.datetime.strptime(
                                       vote['StatusDate'], '%m/%d/%Y'),
                                   motion=re.sub(HTML_TAGS_RE, "",
                                                 vote['FullStatus']).strip(),
                                   passed=did_pass,
                                   yes_count=yea_count,
                                   no_count=nay_count,
                                   other_count=len(roll_call_other))
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                try:
                    vote_to_add.validate()
                except ValueError as e:
                    self.warning(e)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Example #52
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble every bill of a session from the legislature's DBF files.

        Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ tables
        through ``self.get_dbf`` plus the zipped roll-call text files, attaches
        sponsors, documents/versions, votes, actions and subjects to the
        matching ``Bill`` and finally hands each bill to ``self.save_bill``.

        MAINBILL rows with a blank synopsis are skipped, so the secondary
        tables may reference bills that were never created.  Such records are
        reported with ``self.warning`` and ignored rather than aborting the
        whole scrape with a ``KeyError``.

        :param session: session identifier; stored on each bill as
            ``str(session)``
        :param year_abr: first year of the session; selects the DBF files,
            is interpolated into document URLs and names the vote archives
        :raises Exception: when a document record carries a doctype that is
            missing from ``self._doctypes``
        """

        # Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session),
                        chamber,
                        bill_id,
                        title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(
            year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            # the bill may have been skipped above (blank title)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            # keep only the last directory and the file name of the
            # backslash-separated path stored in the table
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
                year_abr, document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % filename
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            # one Vote per (bill, chamber, motion), keyed by vote_id
            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            # archives whose name starts with 'C' hold committee votes and
            # use a different column layout than the floor-vote archives
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:

                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    # counts and outcome are filled in once all rows are read
                    votes[vote_id] = Vote(chamber,
                                          date,
                                          action,
                                          None,
                                          None,
                                          None,
                                          None,
                                          bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # release the archive handle, then remove temp file
            zipedfile.close()
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                vote["passed"] = vote_yes_count > vote_no_count
                vote_bill_id = vote["bill_id"]
                # the bill may have been skipped above (blank title)
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            # the bill may have been skipped above (blank title)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = actor_map[rec["house"]]
            comment = rec["comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Example #53
0
    def scrape_votes(self, bill, url):
        """Parse every roll call on a vote page and attach it to ``bill``.

        The page at ``url`` is searched for ``<p>`` headers naming the
        Oklahoma House or State Senate.  For each header the motion, RCS
        number, date, per-category tallies and voter names are read from the
        paragraphs that follow it, and a ``Vote`` is added to ``bill``.

        :param bill: bill object that receives the votes via ``add_vote``
        :param url: address of the vote page; also recorded as the vote source
        :raises AssertionError: when the number of names collected for a
            category differs from the tally printed on the page
        """
        page = lxml.html.fromstring(self.urlopen(url).replace(u'\xa0', ' '))

        re_ns = "http://exslt.org/regular-expressions"
        # Raw string: the \s escapes are for the EXSLT regex engine and must
        # not be interpreted as (invalid) Python string escapes.
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={'re': re_ns}):
            # the motion paragraph sits at a different offset per chamber
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 9

            motion = header.xpath(
                "string(following-sibling::p[%d])" % motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            # a trailing PASSED/FAILED is the outcome, not part of the motion
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            # the date is read from the element right after the RCS line
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the category the names currently being read belong to
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break

                match = re.match(
                    r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING)\s*:\s*(\d+)',
                    line)
                if match:
                    category = match.group(1)
                    if category == 'VACANT':
                        continue  # skip these
                    vtype = {'YEAS': 'yes', 'NAYS': 'no'}.get(category,
                                                              'other')
                    counts[vtype] += int(match.group(2))
                else:
                    # name lines: entries are separated by three spaces
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE BILL' in name or 'SENATE BILL' in name:
                            continue
                        votes[vtype].append(name.strip())

            assert len(votes['yes']) == counts['yes']
            assert len(votes['no']) == counts['no']
            assert len(votes['other']) == counts['other']

            # no explicit outcome on the page: yeas must beat everything else
            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber, date, motion, passed,
                        counts['yes'], counts['no'], counts['other'],
                        rcs_num=rcs)
            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                vote.no(name)
            for name in votes['other']:
                vote.other(name)

            bill.add_vote(vote)
Example #54
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Build and save a ``Bill`` for every measure of one type.

        Queries ``CABill`` rows for ``session`` and ``type_abbr`` through
        ``self.session`` and, for each row, collects versions, sponsors,
        actions and votes before calling ``self.save_bill``.  Measures for
        which no version yields a title are skipped with a warning.

        :param chamber: ``'upper'`` selects Senate authors, anything else
            selects Assembly authors; also the default actor for actions
        :param session: session year string such as ``'20092010'``
        :param bill_type: type label placed first in each bill's type list
        :param type_abbr: measure type abbreviation used to filter the query
        :raises ScrapeError: when a vote location names neither chamber
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        # Compiled once for all votes.  re.sub() only gained a flags
        # argument in Python 2.7, hence the explicit compile.
        session_suffix_re = re.compile(r'(\w+)( Extraordinary)? Session$',
                                       re.IGNORECASE)
        chamber_prefix_re = re.compile(r'^(Senate|Assembly) ',
                                       re.IGNORECASE)

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            # the title is filled in below once the versions have been read
            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct session for web query, going from '20092010' to '0910'
            source_session = session[2:4] + session[6:8]

            # Turn 'AB 10' into 'ab_10'
            source_num = "%s_%s" % (bill.measure_type.lower(),
                                    bill.measure_num)

            # Construct a fake source url
            source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                          "bill_number=%s&sess=%s" %
                          (source_num, source_session))

            fsbill.add_source(source_url)

            scraped_versions = self.scrape_site_versions(source_url)

            title = ''
            short_title = ''
            bill_types = ['bill']
            subject = ''
            all_titles = set()
            # index of the next scraped (date, url) pair not yet matched
            scraped_index = 0
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                title = clean_title(version.title)
                all_titles.add(title)
                short_title = clean_title(version.short_title)
                bill_types = [bill_type]

                if version.appropriation == 'Yes':
                    bill_types.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    bill_types.append('fiscal committee')
                if version.local_program == 'Yes':
                    bill_types.append('local program')
                if version.urgency == 'Yes':
                    bill_types.append('urgency')
                if version.taxlevy == 'Yes':
                    bill_types.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

                date = version.bill_version_action_date.date()

                # Use the scraped URL only when its date lines up with this
                # version; otherwise keep the pointer in place and leave the
                # URL empty.
                url = ''
                try:
                    scraped_version = scraped_versions[scraped_index]
                    if scraped_version[0] == date:
                        url = scraped_version[1]
                        scraped_index += 1
                except IndexError:
                    pass

                fsbill.add_version(version.bill_version_id,
                                   url,
                                   date=date,
                                   title=title,
                                   short_title=short_title,
                                   subject=[subject],
                                   type=bill_types)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            # the values left over from the last usable version win
            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = bill_types
            fsbill['subjects'] = [subject]

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # NOTE: ``version`` is whichever entry the loop above visited
            # last, even if that entry had no bill_xml.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            introduced = False

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    # e.g. a committee: only the chamber prefix is mapped
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                action_types = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                if act_str.startswith('Introduced'):
                    introduced = True
                    action_types.append('bill:introduced')

                if 'Read first time.' in act_str:
                    # first reading doubles as introduction when no
                    # explicit 'Introduced' action has been seen yet
                    if not introduced:
                        action_types.append('bill:introduced')
                        introduced = True
                    action_types.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    action_types.append('committee:referred')

                if 'Read third time.  Passed.' in act_str:
                    action_types.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    action_types.append('governor:signed')

                if 'Item veto' in act_str:
                    action_types.append('governor:vetoed:line-item')

                if 'Vetoed by Governor' in act_str:
                    action_types.append('governor:vetoed')

                if 'To Governor' in act_str:
                    action_types.append('governor:received')

                if 'Read second time' in act_str:
                    action_types.append('bill:reading:2')

                if not action_types:
                    action_types = ['other']

                fsbill.add_action(actor,
                                  act_str,
                                  action.action_date.date(),
                                  type=action_types)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                # the first word names the chamber, the rest the location
                full_loc = vote.location.description
                first_part, _, vote_location = full_loc.partition(' ')
                first_part = first_part.lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                motion = vote.motion.motion_text or ''

                if ("Third Reading" in motion or "3rd Reading" in motion or
                        "Do Pass" in motion):
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session/chamber boilerplate and bill references so
                # that only the motion itself remains.
                motion = session_suffix_re.sub('', motion)
                motion = chamber_prefix_re.sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # SJR is included here to match the sibling patterns
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                # The abstain count field in CA's database includes
                # vacancies, which we aren't interested in.
                fsvote['other_count'] = len(fsvote['other_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Example #55
0
    def _parse_votes(self, url, vote):
        '''Given a vote url and a mapping of vote attributes, extract the
        voters and the vote counts from the vote page and build a Vote.

        PDF urls are handed to PDFCommitteeVote.  HTML pages are parsed
        directly: the second table holds the yes/no/excused/absent totals
        and the third table the individual votes.

        :param url: address of the vote document
        :param vote: mapping with at least an 'action' key; it is updated
            in place with counts, motion and a placeholder 'passed' value
            and then expanded into the Vote constructor
        :returns: a Vote, or None when the document is missing, cannot be
            parsed, or carries no vote tables
        :raises Exception: when the action text gives conflicting or no
            indication of whether the vote passed
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.urlopen(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.bytes)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        keymap = {'Y': 'yes', 'N': 'no'}
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused, absent = map(int, vals)
        vote.update(yes_count=yes_count, no_count=no_count,
                    other_count=excused + absent)

        # Get the motion.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except (IndexError, AttributeError):
            # Some of them mysteriously have no motion listed (no <br> at
            # all, or nothing after the last one).
            motion = vote['action']

        vote['motion'] = motion

        # Add placeholder for passed (see below)
        vote['passed'] = False

        vote = Vote(**vote)

        # Each cell reads '<code>\xa0<name>'; anything but Y/N is 'other'.
        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            v, name = filter(None, text.split(u'\xa0'))
            getattr(vote, keymap.get(v, 'other'))(name)

        action = vote['action']

        # Determine the value of `passed`.  The tallies parsed from the
        # page are compared, not the lists of voter names: comparing the
        # lists would order them lexicographically by name.
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        if any(indicator in action for indicator in vote_passage_indicators):
            passed = True
        for indicator in vote_failure_indicators:
            if indicator not in action:
                continue
            if passed is True:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # outnumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                else:
                    raise Exception("passage and failure indicator "
                                    "both present at: %s" % url)
            elif passed is None:
                passed = False
        for indicator in vote_ambiguous_indicators:
            if indicator in action:
                passed = yes_count > no_count
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote['passed'] = passed

        return vote
Example #56
0
    def scrape(self, chamber, session):
        """Scrape roll-call votes for ``chamber`` out of the linked PDFs.

        The page registered for ``chamber`` in ``PAGES`` is fetched, every
        linked ``.pdf`` is downloaded and converted to text, and the text is
        walked line by line with a small state machine:

        * a line containing "question being" starts the motion text, which
          keeps accumulating until the motion state is cleared (for example
          by a blank line);
        * a ``ROLL CALL`` line starts the tally section, whose
          ``HEADING: name; name`` lines are collected per heading;
        * a line containing "passed" or "lost" while in the tally section
          closes the vote, which is saved with ``self.save_vote`` if the line
          names a bill and a motion was captured.

        :param chamber: key into ``PAGES``; chambers without an entry are
            skipped.
        :param session: session identifier stored on every saved vote.
        """
        if chamber not in PAGES:
            return

        url = PAGES[chamber]
        page = self.lxmlize(url)

        # Tally headings as they appear in the PDF text -> Vote method names.
        tally_methods = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other",
        }
        bill_chambers = {"H": "lower", "S": "upper", "J": "joint"}

        for pdf in page.xpath("//a[contains(@href, '.pdf')]"):
            pdf_url = pdf.attrib['href']
            path, _response = self.urlretrieve(pdf_url)
            try:
                data = convert_pdf(path, type='text')
            finally:
                # Remove the downloaded file even when the conversion fails.
                os.unlink(path)

            # Parser state, reset for every PDF.
            results = {}
            cur_date = None
            cur_vote = None
            cur_motion = ""
            in_motion = False
            in_vote = False

            for line in data.splitlines():
                # Only the first date found in a PDF is used for its votes.
                date = re.findall(date_re, line)
                if date and not cur_date:
                    cur_date = datetime.datetime.strptime(
                        date[0][0], "%A, %B %d, %Y")

                if line.strip() == "":
                    in_motion = False
                    continue

                lowered = line.lower()
                if in_vote and ('passed' in lowered or 'lost' in lowered):
                    # The tally section is over either way: take the
                    # collected names and reset the per-vote state.
                    tallies, results = results, {}
                    in_vote = False
                    in_motion = False
                    cur_vote = None

                    bills = re.findall(r"(?i)(H|S|J)(B|R|M) (\d+)", line)
                    if bills == [] or cur_motion.strip() == "":
                        # No bill or no motion: nothing to save.  As before,
                        # cur_motion is deliberately left untouched here.
                        continue

                    self.debug("CM: %s", cur_motion)

                    # The last bill mentioned on the result line is used.
                    cur_bill_id = "%s%s %s" % bills[-1]

                    res = {}
                    for heading, method in tally_methods.items():
                        res[method] = [who for who in tallies.get(heading, [])
                                       if who != ""]

                    yes, no, other = (len(res['yes']), len(res['no']),
                                      len(res['other']))
                    bc = bill_chambers.get(cur_bill_id[0], 'other')

                    vote = Vote(chamber,
                                cur_date,
                                cur_motion, (yes > no),
                                yes,
                                no,
                                other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)

                    vote.add_source(pdf_url)
                    vote.add_source(url)

                    for method, people in res.items():
                        cast = getattr(vote, method)
                        for person in people:
                            cast(person)

                    self.save_vote(vote)
                    cur_motion = ""
                    # No `continue`: the same line still goes through the
                    # checks below, exactly as it always has.

                if 'VOTES FOR' in line:
                    in_motion = False
                    in_vote = False
                    continue

                # NOTE(review): 'ABSET' looks like a typo for 'ABSENT', but it
                # is kept as-is because changing it would alter which lines
                # start a tally section -- confirm against real PDFs.
                if 'ABSET' in line:
                    if in_motion:
                        in_vote = True
                    in_motion = False

                if ":" in line and in_vote:
                    # "HEADING: name; name; ..." opens a new tally heading.
                    cur_vote, who = line.split(":", 1)
                    results[cur_vote] = [x.strip() for x in who.split(';')]
                    continue

                if in_vote:
                    if cur_vote is None:
                        continue

                    # Continuation line: more names for the current heading.
                    results[cur_vote].extend(
                        x.strip() for x in line.split(";"))
                    continue

                if "question being" in line:
                    cur_motion = line.strip()
                    in_motion = True
                    continue

                if in_motion:
                    cur_motion += line.strip()
                    continue

                if line.strip() == 'ROLL CALL':
                    in_vote = True
Example #57
0
    def scrape(self, chamber, session):
        """Scrape votes for ``chamber`` from the ncleg.net vote-data archive.

        Downloads the zipped vote data for ``session``, reads its ``Members``,
        ``Votes`` and ``MemberVotes`` text files, builds one ``Vote`` per
        roll call held in ``chamber`` on a bill, records how each known
        member voted, and saves every vote with ``self.save_vote``.

        The archive is closed and the downloaded file removed even when
        parsing fails part-way through.

        :param chamber: ``'lower'`` selects rows coded ``H``; any other value
            selects rows coded ``S``.
        :param session: session name, used to build the archive URL and the
            file names inside it.  ``'2009'`` has its own naming scheme and
            uses ``;`` as the field delimiter.
        """
        # Unfortunately, you now have to request access to FTP.
        # This method of retrieving votes needs to be changed or
        # fall back to traditional web scraping.
        if session == '2009':
            # 2009 files have a different delimiter and naming scheme.
            vote_data_url = 'ftp://www.ncleg.net/Bill_Status/Vote Data 2009.zip'
            naming_scheme = '{session}{file_label}.txt'
            delimiter = ";"
        else:
            vote_data_url = 'ftp://www.ncleg.net/Bill_Status/Votes%s.zip' % session
            naming_scheme = '{file_label}_{session}.txt'
            delimiter = "\t"

        chamber_code = 'H' if chamber == 'lower' else 'S'
        bill_chambers = {'H': 'lower', 'S': 'upper'}

        fname, _resp = self.urlretrieve(vote_data_url)
        zf = None
        try:
            zf = ZipFile(fname)

            # Members_YYYY.txt: tab separated
            # 0: id (unique only in chamber)
            # 1: H or S
            # 2: member name
            # 3-5: county, district, party
            # 6: mmUserId
            member_file = zf.open(naming_scheme.format(file_label='Members',
                                                       session=session))
            members = {}
            for line in member_file:
                data = line.split(delimiter)
                if len(data) < 3:
                    self.warning('line too short %s', data)
                    continue
                if data[1] == chamber_code:
                    members[data[0]] = data[2]

            # Votes_YYYY.txt
            # 0: sequence number
            # 1: chamber (S/H)
            # 2: date
            # 3: prefix
            # 4: bill_id
            # 5: yes votes
            # 6: no votes
            # 7: excused absences
            # 8: excused votes
            # 9: didn't votes
            # 10: total yes+no
            # 11: sponsor
            # 12: reading info
            # 13: info
            # 20: PASSED/FAILED
            # 21: legislative day
            vote_file = zf.open(naming_scheme.format(file_label='Votes',
                                                     session=session))
            votes = {}
            for line in vote_file:
                data = line.split(delimiter)
                if len(data) < 24:
                    self.warning('line too short %s', data)
                    continue
                if data[1] != chamber_code:
                    continue

                date = datetime.datetime.strptime(data[2][:16],
                                                  '%Y-%m-%d %H:%M')
                if data[3][0] not in bill_chambers:
                    # skip votes that aren't on bills
                    self.log('skipping vote %s' % data[0])
                    continue

                votes[data[0]] = Vote(chamber, date, data[13],
                                      'PASS' in data[20],
                                      int(data[5]),
                                      int(data[6]),
                                      int(data[7]) + int(data[8]) + int(data[9]),
                                      bill_chamber=bill_chambers[data[3][0]],
                                      bill_id=data[3] + data[4],
                                      session=session)

            # MemberVotes_YYYY.txt
            # 0: member id
            # 1: chamber (S/H)
            # 2: vote id
            # 3: vote chamber (always same as 1)
            # 4: vote (Y,N,E,X)
            # 5: pair ID (member)
            # 6: pair order
            # If a vote is paired then it should be counted as an 'other'
            member_vote_file = zf.open(
                naming_scheme.format(file_label='MemberVotes',
                                     session=session))
            for line in member_vote_file:
                data = line.split(delimiter)
                if len(data) < 6:
                    self.warning('line too short %s', data)
                    continue
                if data[1] != chamber_code:
                    continue

                member_voting = members.get(data[0])
                if member_voting is None:
                    self.debug('Member %s not found.' % data[0])
                    continue

                vote = votes.get(data[2])
                if vote is None:
                    self.debug('Vote %s not found.' % data[2])
                    continue

                paired = data[5]
                # -1 votes are Lt. Gov, not included in count, so we add them
                if data[4] == 'Y' and not paired:
                    if data[0] == '-1':
                        vote['yes_count'] += 1
                    vote.yes(member_voting)
                elif data[4] == 'N' and not paired:
                    if data[0] == '-1':
                        vote['no_count'] += 1
                    vote.no(member_voting)
                else:
                    # for some reason other_count is high for paired votes
                    if paired:
                        vote['other_count'] -= 1
                    # is either E: excused, X: no vote, or paired (doesn't count)
                    vote.other(member_voting)

            for vote in votes.values():
                vote.add_source(vote_data_url)
                self.save_vote(vote)
        finally:
            # Release the archive handle and remove the downloaded file
            # whether or not parsing succeeded.
            if zf is not None:
                zf.close()
            os.remove(fname)
Example #58
0
    def scrape_vote(self, bill, name, url):
        """Scrape the roll call at ``url`` and add it to ``bill``.

        The tally ("Those voting Yea", "Those voting Nay", "Those absent",
        "Necessary for") and the date ("Taken on") are read from ``<span>``
        elements; individual votes are read from the first table on the
        page.  Pages containing ``BUDGET ADDRESS`` are ignored.

        :param bill: bill the vote is attached to; ``bill['session']`` is
            used as the year of the vote date.
        :param name: motion text stored on the vote.
        :param url: address of the roll-call page; URLs containing
            ``VOTE/H`` are treated as lower-chamber pages, all others as
            upper-chamber pages.
        :raises AttributeError: if one of the expected spans is missing or
            holds no number (the regular expression does not match).
        """
        # Column layout of the roll-call table differs between chambers:
        # ``cols`` are the starting columns of each member group, the
        # offsets locate the name / yea / nay cells relative to that start.
        if "VOTE/H" in url:
            vote_chamber = 'lower'
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = 'upper'
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        # Connecticut's SSL is causing problems with Scrapelib, so use Requests
        # NOTE(review): verify=False turns off certificate verification;
        # left unchanged because it is a deliberate workaround.
        page = requests.get(url, verify=False).text

        if 'BUDGET ADDRESS' in page:
            return

        page = lxml.html.fromstring(page)

        def span_text(label):
            # Text of the <span> whose content includes ``label``.
            return page.xpath("string(//span[contains(., '%s')])" % label)

        def span_count(label):
            # First run of digits in that span, as an int.
            return int(
                re.match(r'[^\d]*(\d+)[^\d]*', span_text(label)).group(1))

        yes_count = span_count('Those voting Yea')
        no_count = span_count('Those voting Nay')
        other_count = span_count('Those absent')
        need_count = span_count('Necessary for')

        date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)',
                        span_text('Taken on')).group(1)
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(date + " " + bill['session'],
                                          "%m/%d %Y").date()

        # A motion carries once the yeas REACH the required count, so the
        # comparison is inclusive.
        passed = yes_count >= need_count
        vote = Vote(vote_chamber, date, name, passed,
                    yes_count, no_count, other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for i in cols:
                voter = row.xpath("string(td[%d])" % (i + name_offset)).strip()

                if not voter or voter == 'VACANT':
                    continue

                if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                    vote.yes(voter)
                elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                    vote.no(voter)
                else:
                    vote.other(voter)

        bill.add_vote(vote)
Example #59
0
    def _scrape_bill_details(self, url, bill):
        """Fill ``bill`` with the summary, versions, actions and votes at ``url``.

        * The first non-empty paragraph after the ``SUMMARY`` heading becomes
          ``bill['summary']``.
        * Every link under ``FULL TEXT`` is added as a version, except
          budget links (containing ``+men+``), which have no full text.
        * Every entry under ``HISTORY`` is classified and added as an
          action; entries carrying a vote count are additionally turned
          into votes.

        Vote handling: one vote usually shows up as two consecutive history
        lines, in no guaranteed order.  The first line with a count is
        cached; if the next counted line has the same tallies the two are
        merged into one vote, otherwise the cached vote is saved on its own.

        :param url: address of the bill detail page.
        :param bill: bill object that receives the scraped data.
        """
        html = self.get(url, retry_on_404=True).text
        doc = lxml.html.fromstring(html)

        # summary sections
        summary = doc.xpath(
            '//h4[starts-with(text(), "SUMMARY")]/following-sibling::p/text()')
        if summary and summary[0].strip():
            bill['summary'] = summary[0].strip()

        # versions
        for va in doc.xpath(
                '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):

            # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
            date, desc = va.text.split(u' \xa0')
            # Chop off the last part.  The result used to be discarded, so
            # the trailing token was never actually removed.
            desc = desc.rsplit(' ', 1)[0]
            link = va.get('href')
            if 'http' not in link:
                link = '{}{}'.format(BASE_URL, link)
            date = datetime.datetime.strptime(date, '%m/%d/%y')

            # budget bills in VA are searchable but no full text available
            if '+men+' in link:
                self.warning(
                    'not adding budget version, bill text not available')
            else:
                # VA duplicates reprinted bills, lets keep the original name
                bill.add_version(desc,
                                 link,
                                 date=date,
                                 mimetype='text/html',
                                 on_duplicate='use_old')

        # actions
        cached_vote = None
        cached_action = None
        for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/'
                             'li'):
            vote = None

            date, action = ali.text_content().split(u' \xa0')
            actor, action = action.split(': ', 1)

            # Bill history entries purely in parentheses tend to be
            # notes and not actions, so we'll skip them.
            if action.startswith('(') and action.endswith(')'):
                continue

            actor = self.actor_map[actor]
            date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

            # if action ends in (##-Y ##-N) remove that part
            vrematch = self.vote_strip_re.match(action)
            # The following conditional logic is messy to handle
            # Virginia's crazy and inconsistently formatted bill
            # histories. Someone less harried and tired than me
            # could probably make this much cleaner. - alo
            if vrematch:
                vote_action, y, n, o = vrematch.groups()
                y = int(y)
                n = int(n)
                # The "other" count is left out of the action text when
                # there were no abstentions, so default it to 0.
                o = 0 if o is None else int(o)

                vote_url = ali.xpath('a/@href')

                if cached_vote is None:
                    # First counted line of a possible pair: cache it and
                    # wait for the next entry before saving anything.
                    cached_action = action
                    cached_vote = Vote(actor, date, vote_action, y > n, y, n,
                                       o)
                    if vote_url:
                        cached_vote.add_source(BASE_URL + vote_url[0])
                    continue

                # A vote is already cached: try to pair it with this line.
                if vote_action.startswith(u'VOTE:'):
                    if (vote_url and cached_vote['yes_count'] == y
                            and cached_vote['no_count'] == n
                            and cached_vote['other_count'] == o):
                        vote = cached_vote
                        self._parse_vote(vote, vote_url[0])
                        vote.add_source(BASE_URL + vote_url[0])
                        # Report the descriptive action cached earlier
                        # instead of the bare "VOTE:" line.
                        action = cached_action
                elif cached_vote['motion'].startswith('VOTE:'):
                    if (cached_vote['yes_count'] == y
                            and cached_vote['no_count'] == n
                            and cached_vote['other_count'] == o):
                        vote = cached_vote
                        vote['motion'] = vote_action
                else:
                    # Cached vote doesn't match up to the current
                    # one. Save, then cache the current vote to
                    # begin the next search.
                    bill.add_vote(cached_vote)
                    cached_vote = Vote(actor, date, vote_action, y > n, y,
                                       n, o)
                    if vote_url:
                        cached_vote.add_source(BASE_URL + vote_url[0])
                    cached_action = action
                    continue

                if vote:
                    bill.add_vote(vote)
                else:
                    # A "VOTE:" line was involved but the tallies differ.
                    self.error('empty vote')
            else:
                # If this action isn't a vote, but the last one was,
                # there's obviously no additional vote data to match.
                # Go ahead and save the cached data.
                if cached_vote is not None:
                    bill.add_vote(cached_vote)

            cached_vote = cached_action = None

            # categorize actions
            for pattern, atype in self._action_classifiers:
                if re.match(pattern, action):
                    break
            else:
                atype = 'other'

            # if matched a 'None' atype, don't add the action
            if atype:
                bill.add_action(actor, action, date, type=atype)

        # A vote cached from the last history entry has no following entry
        # to pair with; save it instead of silently dropping it.
        if cached_vote is not None:
            bill.add_vote(cached_vote)
Example #60
0
    def parse_bill_votes(self, doc, bill):
        """Attach every roll call linked from ``doc`` to ``bill``.

        Each distinct link whose href contains ``votes`` and ends in ``htm``
        is fetched; the motion, chamber, vote type, tallies, date and the
        individual member votes are read from that page and added to
        ``bill`` as one vote.

        :param doc: parsed bill page whose ``<a>`` elements are inspected.
        :param bill: bill object that receives the votes.
        """
        anchors = doc.cssselect('a')

        # MD has a habit of listing votes twice
        visited = set()

        # Row headings in the member table -> Vote method to call for the
        # member links in the rows that follow.
        headings = (
            ('Voting Yea', 'yes'),
            ('Voting Nay', 'no'),
            ('Not Voting', 'other'),
            ('Excused', 'other'),
        )

        for anchor in anchors:
            href = anchor.get('href')
            if not href or "votes" not in href:
                continue
            if not href.endswith('htm') or href in visited:
                continue

            visited.add(href)
            vote_url = BASE_URL + href
            with self.urlopen(vote_url) as vote_html:
                vote_doc = lxml.html.fromstring(vote_html)

                # motion: last text node of the matching cell
                motion = vote_doc.xpath(
                    '//td[@colspan=3]/font[@size=-1]/text()')[-1]
                chamber = 'upper' if 'senate' in href else 'lower'

                # Every classifier is tried; the last one that matches wins.
                vote_type = 'other'
                for regex, vtype in vote_classifiers.iteritems():
                    if re.search(regex, motion, re.IGNORECASE):
                        vote_type = vtype

                # counts: yeas, nays, excused, not voting, absent (in order)
                tallies = vote_doc.xpath('//td[@width="20%"]/font/b/text()')
                yeas, nays, excused, not_voting, absent = [
                    int(tallies[i].split()[0]) for i in range(5)]

                # date
                # parse the following format: March 23, 2009
                date_font = vote_doc.xpath(
                    '//font[starts-with(text(), "Legislative Date")]')[0]
                vote_date = datetime.datetime.strptime(
                    date_font.text[18:], '%B %d, %Y')

                vote = Vote(chamber=chamber,
                            date=vote_date,
                            motion=motion,
                            passed=yeas > nays,
                            yes_count=yeas,
                            no_count=nays,
                            other_count=excused + not_voting + absent,
                            type=vote_type)

                # Heading rows switch the current status; any other row
                # contributes its linked member names under that status.
                status = None
                for row in vote_doc.cssselect('table')[3].cssselect('tr'):
                    text = row.text_content()
                    matched = [method for prefix, method in headings
                               if text.startswith(prefix)]
                    if matched:
                        status = matched[0]
                        continue
                    for cell in row.cssselect('a'):
                        getattr(vote, status)(cell.text.strip())

                vote.add_source(vote_url)
                bill.add_vote(vote)