Esempio n. 1
0
    def parse_vote(
        self,
        bill,
        action,
        act_chamber,
        act_date,
        url,
        re_vote_text=re.compile(
            r'The question (?:being|to be reconsidered):\s*"(.*?\?)"', re.S),
        re_header=re.compile(
            r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
        """Parse every voting report on the page at ``url`` and add it to ``bill``.

        The text of the second ``<pre>`` element is split on ``re_vote_text``;
        each captured question becomes a motion and the text that follows it
        is scanned for tallies and for the lists of voter names.

        Args:
            bill: object whose ``add_vote`` receives each parsed vote.
            action: not used by this method.
            act_chamber: chamber handed to ``Vote``.
            act_date: date handed to ``Vote``.
            url: page to fetch; also recorded as each vote's source.
            re_vote_text: compiled pattern whose single group is the motion.
            re_header: compiled pattern matching the page header that can be
                repeated in the middle of long documents.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        # Splitting on a pattern with one group yields
        # [preamble, motion, text, motion, text, ...].
        chunks = re_vote_text.split(doc.xpath('//pre')[1].text_content())
        votes_data = zip(chunks[1::2], chunks[2::2])

        for motion, text in votes_data:

            yes = no = other = 0

            tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
            for vtype, vcount in tally:
                # The digit group may match nothing (e.g. a dash in place of
                # a number); count that as zero instead of failing on int('').
                vcount = int(vcount) if vcount else 0
                if vtype == 'Y':
                    yes = vcount
                elif vtype == 'N':
                    no = vcount
                else:
                    other += vcount

            vote = Vote(act_chamber, act_date, motion, yes > no, yes, no,
                        other)

            # Line prefix -> method that records a name under that heading.
            # Slicing by len(prefix) keeps the offset in step with the text.
            prefixes = (
                ('Yeas: ', vote.yes),
                ('Nays: ', vote.no),
                ('Excused: ', vote.other),
                ('Absent: ', vote.other),
            )

            # In lengthy documents, the "header" can be repeated in the middle
            # of content. This regex gets rid of it.
            vote_lines = re_header.sub('', text).split('\r\n')

            vote_type = None
            for vote_list in vote_lines:
                for prefix, recorder in prefixes:
                    if vote_list.startswith(prefix):
                        vote_list = vote_list[len(prefix):]
                        vote_type = recorder
                        break
                else:
                    # A blank line ends the current list; any other
                    # unprefixed line continues it.
                    if vote_list.strip() == '':
                        vote_type = None

                if vote_type is not None:
                    for name in vote_list.split(','):
                        name = name.strip()
                        if name:
                            vote_type(name)

            vote.add_source(url)
            bill.add_vote(vote)
Esempio n. 2
0
    def scrape_house(self, session):
        """Scrape the House journal PDFs for ``session`` and save their votes.

        The journal index URL is built from the module-level ``journals``
        template. Every link found under a ``<font>`` element is downloaded,
        converted to text and walked line by line with a small state machine:
        a line matching ``vote_re`` opens a roll call, following lines
        matching ``votes_re`` add individual votes, and the first line with
        no individual votes closes the roll call and saves it.

        Args:
            session: session identifier, used in the index URL and stored on
                every saved vote.
        """
        url = journals % (session, 'House')
        page = self.lxmlize(url)
        hrefs = page.xpath("//font//a")

        # Lowercase fragments that mark unnumbered lines to be ignored.
        blacklist = (
            "house", "page", "general assembly",
            "state of colorado", "session", "legislative day",
        )
        # First letter of the bill id -> chamber of the bill.
        bill_chambers = {"H": "lower", "S": "upper", "J": "joint"}

        for href in hrefs:
            journal_url = href.attrib['href']
            (path, response) = self.urlretrieve(journal_url)
            try:
                data = convert_pdf(path, type='text')

                in_vote = False
                cur_vote = {}
                known_date = None
                cur_vote_count = None
                in_question = False
                cur_question = None
                cur_bill_id = None

                for line in data.split("\n"):
                    # The first date found in the journal is used for every
                    # vote in it.
                    if known_date is None:
                        dates = date_re.findall(line)
                        if dates != []:
                            day_text, dow = dates[0]
                            known_date = datetime.datetime.strptime(
                                day_text, "%A, %B %d, %Y")

                    # Lines that do not start with a number are "non
                    # standard"; those containing a blacklisted fragment are
                    # dropped.
                    non_std = False
                    if re.match(r"(\s+)?\d+.*", line) is None:
                        non_std = True
                        lowered = line.lower().strip()
                        if any(thing in lowered for thing in blacklist):
                            continue

                    found = re.findall(
                        r"(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{4})", line)
                    if found != []:
                        # Each match is (bill_id, chamber letters, type
                        # letter); only the full id is needed.
                        cur_bill_id = found[0][0]

                    try:
                        if not non_std:
                            # Drop the leading number from the line.
                            _, line = line.strip().split(" ", 1)
                        line = line.strip()
                    except ValueError:
                        # Nothing after the number: ends any open
                        # vote/question.
                        in_vote = False
                        in_question = False
                        continue

                    if in_question:
                        cur_question += " " + line
                        continue

                    if ("The question being" in line) or \
                       ("On motion of" in line) or \
                       ("the following" in line) or \
                       ("moved that the" in line):
                        cur_question = line
                        in_question = True

                    if in_vote:
                        if line == "":
                            in_vote = False
                            continue

                        lowered_line = line.lower()
                        if ("co-sponsor" in lowered_line or
                                'the speaker' in lowered_line):
                            # Likely garbage: treat as having no votes.
                            votes = []
                        else:
                            votes = re.findall(votes_re, line)

                        for person, _, v in votes:
                            cur_vote[person] = v

                        if votes == []:
                            in_vote = False
                            # save vote
                            yes, no, other = cur_vote_count
                            if cur_bill_id is None:
                                # NOTE(review): cur_vote is not cleared here,
                                # so names collected for this skipped roll
                                # call stay in the dict for the next one --
                                # confirm whether that is intended.
                                continue

                            bc = bill_chambers[cur_bill_id[0].upper()]

                            # These journals are the House's, so the vote
                            # took place in the lower chamber.
                            vote = Vote('lower',
                                        known_date,
                                        cur_question, (yes > no),
                                        yes,
                                        no,
                                        other,
                                        session=session,
                                        bill_id=cur_bill_id,
                                        bill_chamber=bc)

                            vote.add_source(journal_url)
                            vote.add_source(url)

                            for person in cur_vote:
                                if person is None:
                                    continue

                                vot = cur_vote[person]
                                if vot == 'Y':
                                    vote.yes(person)
                                elif vot == 'N':
                                    vote.no(person)
                                elif vot == 'E' or vot == '-':
                                    vote.other(person)

                            self.save_vote(vote)

                            cur_vote = {}
                            in_question = False
                            cur_question = None
                            in_vote = False
                            cur_vote_count = None
                            continue

                    # A summary line (yes, no, excused, absent) opens a roll
                    # call.
                    summ = vote_re.findall(line)
                    if summ == []:
                        continue
                    yes, no, exc, ab = summ[0]
                    yes, no, exc, ab = \
                        int(yes), int(no), int(exc), int(ab)
                    other = exc + ab
                    cur_vote_count = (yes, no, other)
                    in_vote = True
                    continue
            finally:
                # Remove the downloaded PDF even when parsing fails.
                os.unlink(path)
Esempio n. 3
0
    def parse_senate_vote(self, url):
        """Turn a Senate vote PDF into a ``Vote`` (PDF -> garbled text -> good text).

        The PDF at ``url`` is downloaded, converted with ``convert_pdf`` and
        cleaned with ``convert_sv_text``. The resulting lines are read with a
        two-state machine: before the "YES ... NO ... ABS ... EXC" header the
        date and the PASSED marker are picked up; after it each row carries
        two voters side by side until the TOTALS row.

        Args:
            url: location of the vote PDF; also recorded as the vote's source.

        Returns:
            The populated ``Vote``, or ``None`` when no date was found.

        Raises:
            ValueError: when a row has no mark in any column and no entry in
                the per-document override table.
        """
        vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
        vote.add_source(url)

        fname, resp = self.urlretrieve(url)
        try:
            # this gives us the cleaned up text
            sv_text = convert_sv_text(convert_pdf(fname, 'text'))
        finally:
            # Remove the download even when conversion fails.
            os.remove(fname)
        in_votes = False
        flag = None
        overrides = {"ONEILL": "O'NEILL"}

        # (PDF file name, voter) pairs with no mark in any column.
        vote_override = {
            ("SB0112SVOTE.PDF", "RYAN"): vote.other,  # Recused
            ("HB0144SVOTE.PDF", "SOULES"): vote.other,  # Recused
            ("HJR15SVOTE.PDF", "KELLER"): vote.other,  # Recused
        }

        # use in_votes as a sort of state machine
        for line in sv_text:
            if "bT" in line:  # Whatever generates this text renders the cross
                # in the table as a bT
                continue

            # not 'in_votes', get date or passage
            if not in_votes:
                dmatch = re.search(r'DATE: (\d{2}-\d{2}-\d{2})', line)
                if dmatch:
                    vote['date'] = datetime.strptime(dmatch.group(1),
                                                     '%m-%d-%y')

                if re.search("YES.*NO.*ABS.*EXC", line):
                    # The first character of the header row is the column
                    # separator used by the rows that follow.
                    flag = line[0]
                    in_votes = True

                if 'PASSED' in line:
                    vote['passed'] = True

            # in_votes: totals & votes
            else:
                line = line.replace(flag, "|")
                # totals
                if 'TOTALS' in line:
                    # Lt. Governor voted
                    if 'GOVERNOR' in line:
                        _, name, y, n, a, e = [
                            x.strip() for x in line.split("|")
                        ][:6]
                        assert name == "LT. GOVERNOR"
                        if y == "X":
                            vote.yes(name)
                        elif n == "X":
                            vote.no(name)
                        elif a == "X" or e == "X":
                            vote.other(name)
                        else:
                            raise ValueError("Bad parse")

                    _, yes, no, absent, excused = [
                        x.strip() for x in line.split("|")
                    ][6:-1]

                    vote['yes_count'] = int(yes)
                    vote['no_count'] = int(no)
                    vote['other_count'] = int(absent) + int(excused)
                    # no longer in votes
                    in_votes = False
                    continue

                # pull votes out: each row holds two voters of five cells each
                votes = [x.strip() for x in line.split("|")][1:-1]
                vote1 = votes[:5]
                vote2 = votes[5:]

                for voted in [vote1, vote2]:
                    # A name split across the first two cells is merged back
                    # into one cell when it is a known override.
                    name = "".join(voted[:2])
                    if name in overrides:
                        name = overrides[name]
                        voted.pop(0)
                        voted[0] = name

                    name, yes, no, absent, excused = voted

                    if "District" in name:
                        continue

                    if yes == "X":
                        vote.yes(name)
                    elif no == "X":
                        vote.no(name)
                    elif absent == "X" or excused == "X":
                        vote.other(name)
                    else:
                        key = (os.path.basename(url), name)
                        if key in vote_override:
                            vote_override[key](name)
                        else:
                            raise ValueError("Bad parse")

        # The date is still the '?' placeholder when no DATE line was seen.
        if not isinstance(vote['date'], datetime):
            return None

        return vote
Esempio n. 4
0
    def scrape_vote(self, bill, name, url):
        """Parse the vote PDF at ``url`` and add the result to ``bill``.

        ``name`` must look like "<Senate|House> Vote on <...>,<motion>";
        anything else is ignored. The vote type is derived from the motion
        text, the date and the voter lists from the PDF body.

        Args:
            bill: object whose ``add_vote`` receives the parsed vote.
            name: link text describing the vote.
            url: location of the vote PDF; also recorded as the vote's source.

        Returns:
            None. Nothing is added when ``name`` does not match or the PDF
            contains no "Date: m/d/yyyy" text.
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            motion_type = 'passage'
        elif motion.startswith('AMENDMENT'):
            motion_type = 'amendment'
        elif 'ON 3RD READING' in motion:
            motion_type = 'reading:3'
        else:
            motion_type = 'other'

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote['type'] = motion_type
        vote.add_source(url)

        (fd, temp_path) = tempfile.mkstemp()
        try:
            self.urlretrieve(url, temp_path)
            html = pdf_to_lxml(temp_path)
        finally:
            # Release the descriptor and the file even when the download or
            # the conversion fails.
            os.close(fd)
            os.remove(temp_path)

        vote_type = None
        total_re = re.compile(r'^Total--(\d+)$')
        body = html.xpath('string(/html/body)')

        date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
        if date_match is None:
            self.warning("BAD VOTE: date error")
            return

        vote['date'] = dt.datetime.strptime(date_match.group(1), '%m/%d/%Y')

        # Heading line -> key of the list that following names belong to.
        sections = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}

        for line in body.replace(u'\xa0', '\n').split('\n'):
            line = line.replace('&nbsp;', '').strip()
            if not line:
                continue

            if line in sections:
                vote_type = sections[line]
            elif line in ('Total', '--'):
                vote_type = None
            elif vote_type:
                match = total_re.match(line)
                if match:
                    # Keeps the total line out of the name lists; the value
                    # itself is replaced by the recount below.
                    vote['%s_count' % vote_type] = int(match.group(1))
                elif vote_type == 'yes':
                    vote.yes(line)
                elif vote_type == 'no':
                    vote.no(line)
                elif vote_type == 'other':
                    vote.other(line)

        # tally counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote['passed'] = (
            vote['yes_count'] > (vote['no_count'] + vote['other_count']))

        bill.add_vote(vote)
Esempio n. 5
0
    def scrape(self, chamber, session):
        """Scrape and save every bill listed for ``chamber`` in ``session``.

        The chamber's bill list page supplies the bill numbers and the links
        to the detail pages. Each detail page supplies sponsors, title,
        actions and the votes embedded in action text; the bill index page
        supplies the versions.

        Args:
            chamber: 'upper' selects the senate list; any other value selects
                the house list.
            session: key into ``self.metadata['session_details']``; also used
                in the assembly URL.
        """
        # determining the start year of the term
        start_year = self.metadata['session_details'][session][
            'start_date'].year

        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            chamber_letter = 'S'
        else:
            url_chamber_name = 'house'
            chamber_letter = 'H'

        assembly_url = urljoin(self.site_root,
                               '/assembly/%s-%s' % (session, start_year))

        chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)
        bill_list_url = assembly_url + chamber_url
        subject_url = assembly_url + '/subject-index/major-topic.html'

        if not self.subjects:
            self._scrape_subjects(subject_url)

        with self.urlopen(bill_list_url) as html:
            list_page = lxml.html.fromstring(html)
            # bill_num -> (Bill, details page URL), processed further below
            bills_by_num = {}
            for link in list_page.xpath('/html/body/table[3]/tr/th/a'):
                bill_num = link.text
                # Built from assembly_url rather than by slicing a fixed
                # number of characters off bill_list_url, which only fitted
                # the house list URL and left an extra '/' for the senate.
                bill_url = assembly_url + '/' + link.attrib['href'][2:]
                bill_prefix, bill_type = self.bill_type_info(bill_num)
                bill_id = '%s%s %s' % (chamber_letter, bill_prefix, bill_num)
                # The title is filled in from the details page.
                bill = Bill(session,
                            chamber,
                            bill_id,
                            '',
                            type=bill_type,
                            subjects=self.subjects[bill_id])

                # sources
                bill.add_source(bill_url)
                bill.add_source(bill_list_url)

                bills_by_num[bill_num] = (bill, bill_url)

            # bill details page
            for curr_bill, url in bills_by_num.values():
                with self.urlopen(url) as bill_html:
                    bill_page = lxml.html.fromstring(bill_html)
                    for bill_info in bill_page.xpath(
                            '/html/body/table[4]/tr/td'):
                        info = bill_info.text
                        if info is None:
                            # A cell without leading text holds neither
                            # sponsors nor a title.
                            continue

                        # Sponsors
                        if "Introduced" in info:
                            if ('Rep' in info) or ('Sen' in info):
                                sponsors = info[18:].split(',')
                            else:
                                sponsors = [info[13:]]
                            # The first listed sponsor is the primary one.
                            for index, sponsor in enumerate(sponsors):
                                if index == 0:
                                    sponsor_type = 'primary'
                                else:
                                    sponsor_type = 'cosponsor'
                                curr_bill.add_sponsor(sponsor_type,
                                                      sponsor.strip())
                        else:
                            # title
                            curr_bill["title"] = info.strip()

                    # actions
                    # Rows without a date reuse the previous row's date;
                    # None until the first dated row is seen.
                    last_date = None
                    actor = ''
                    action_num = len(
                        bill_page.xpath('/html/body/table[5]//tr'))
                    for row in range(2, action_num, 2):
                        path = '//table[5]/tr[%s]/' % (row)
                        action = bill_page.xpath(path + 'td[4]')[0].text

                        # An empty actor cell keeps the previous actor.
                        raw_actor = bill_page.xpath(path + 'td[2]')[0].text
                        if not raw_actor:
                            pass
                        elif raw_actor.strip() == 'Senate':
                            actor = 'upper'
                        else:
                            actor = 'lower'

                        raw_date = bill_page.xpath(
                            path + 'th')[0].text.strip()
                        if raw_date:
                            action_date = datetime.strptime(
                                raw_date + '/' + str(start_year), '%m/%d/%Y')
                        else:
                            action_date = last_date
                        last_date = action_date

                        atype = categorize_action(action)
                        curr_bill.add_action(actor, action, action_date, atype)

                        # votes
                        if "yeas" in action:
                            words = action.split()
                            yes_count = int(words[words.index('yeas') + 1])
                            no_token = words[words.index('nays') + 1]
                            # Drop the last character when the token carries
                            # a comma.
                            if ',' in no_token:
                                no_count = int(no_token[0:-1])
                            else:
                                no_count = int(no_token)
                            passed = yes_count > no_count
                            vote_type = self.vote_type_info(action)

                            vote = Vote(actor, action_date, action, passed,
                                        yes_count, no_count, 0, vote_type)
                            curr_bill.add_vote(vote)

                # versions
                bill_num = curr_bill['bill_id'].split()[1]
                versions_url = assembly_url + '/bill-index/bi' + bill_num + '.html'
                curr_bill.add_source(versions_url)
                with self.urlopen(versions_url) as versions_html:
                    versions_page = lxml.html.fromstring(versions_html)
                    for version_row in versions_page.xpath('//table[4]/tr'):
                        cells = version_row.xpath("./*")
                        if len(cells) < 3:
                            continue

                        link = cells[2].xpath("./a")[0]
                        link_name = link.text_content().strip()
                        link_url = "%s/%s" % (assembly_url + '/bill-index',
                                              link.attrib['href'])
                        curr_bill.add_version(link_name,
                                              link_url,
                                              mimetype='application/pdf')

                self.save_bill(curr_bill)
Esempio n. 6
0
    def scrape_votes(self, session):
        """Attach roll call votes from the two pipe-delimited data files.

        ``RollCallSummary.txt`` yields one ``Vote`` per line for bills known
        to ``self.bills_by_id``. ``RollCallHistory.txt`` then adds each
        legislator's individual vote to the matching ``Vote``.

        Args:
            session: only lines whose first field equals this value are used.
        """
        votes = {}
        # Fields of a short summary line kept until its continuation arrives.
        last_line = []

        lines = self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
        ).content.splitlines()

        for line in lines:

            if len(line) < 2:
                continue

            if line.strip() == "":
                continue

            line = line.split('|')
            if len(line) < 14:
                if len(last_line + line[1:]) == 14:
                    # This line completes the stored fragment: its first
                    # field is the rest of the fragment's last field.
                    line = (last_line[:-1] +
                            [last_line[-1] + ' ' + line[0]] +
                            line[1:])
                    last_line = []
                    self.warning('used bad vote line')
                else:
                    # Incomplete record: keep it for a later continuation and
                    # do not try to parse it now.
                    last_line = line
                    self.warning('bad vote line %s' % '|'.join(line))
                    continue
            session_yr = line[0].replace('\xef\xbb\xbf', '')
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            motion = line[11].strip() or '[not available]'

            if session_yr == session and bill_id in self.bills_by_id:
                actor = 'lower' if body == 'H' else 'upper'
                time = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
                # TODO: stop faking passed somehow
                passed = yeas > nays
                # other_count starts at 0 and is incremented per individual
                # vote in the second pass below.
                vote = Vote(actor,
                            time,
                            motion,
                            passed,
                            yeas,
                            nays,
                            other_count=0)
                votes[body + vote_num] = vote
                self.bills_by_id[bill_id].add_vote(vote)

        for line in self.get(
                'http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt'
        ).content.splitlines():
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, vote, date = \
                line.split('|')

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = self.legislators[employee]['name']
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue

                roll_call = votes[vote_key]
                if vote == 'Yea':
                    roll_call.yes(leg)
                elif vote == 'Nay':
                    roll_call.no(leg)
                else:
                    roll_call.other(leg)
                    roll_call['other_count'] += 1
Esempio n. 7
0
 def asvote(self):
     """Return a ``Vote`` built from ``self.asdict()`` with its voter lists set.

     The three voter lists are taken from this object's ``yes_votes``,
     ``no_votes`` and ``other_votes`` methods, and ``self.url`` is recorded
     as the vote's source.
     """
     vote = Vote(**self.asdict())
     vote['yes_votes'] = self.yes_votes()
     vote['no_votes'] = self.no_votes()
     vote['other_votes'] = self.other_votes()
     vote.add_source(self.url)
     return vote
Esempio n. 8
0
    def scrape_journal(self, url, chamber, session, date):
        """Extract and save every vote found in the journal PDF at ``url``.

        The PDF is converted to XML and read line by line. A line containing
        'Shall' together with 'bill pass?', 'resolution' or 'amendment'
        starts a vote: following lines are appended until a parenthesised
        bill id is found, then ``self.parse_votes`` reads the tallies from
        the same line iterator.

        Args:
            url: location of the journal PDF; also recorded as vote source.
            chamber: chamber stored on each vote.
            session: session stored on each vote.
            date: date stored on each vote.

        Returns:
            None. Returns early when the PDF does not convert to valid XML or
            when the journal ends before a bill id is found for a question.
        """
        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r' % filename)
        xml = convert_pdf(filename)
        try:
            et = lxml.etree.fromstring(xml)
        except lxml.etree.XMLSyntaxError:
            self.logger.warning('Skipping invalid pdf: %r' % filename)
            return

        lines = self._journal_lines(et)
        while True:
            try:
                line = next(lines)
            except StopIteration:
                break

            text = gettext(line)

            # Go through with vote parse only if the line asks a
            # "Shall ..." question about a bill, resolution or amendment.
            if 'Shall' not in text:
                continue
            if not ('bill pass?' in text or
                    'resolution' in text or
                    'amendment' in text):
                continue

            # Get the bill_id, extending the question text line by line.
            bill_id = None
            for line in lines:
                text += gettext(line)
                m = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
                if m:
                    bill_id = m.group(1)
                    break

            if bill_id is None:
                # The journal ran out before a bill id appeared, so there is
                # nothing left to parse. Checking here also avoids splitting
                # a motion that may not contain '('.
                return

            motion = text.strip()
            motion = re.sub(r'\s+', ' ', motion)
            # Drop the trailing "(bill id)" part.
            motion, _ = motion.rsplit('(', 1)
            motion = motion.replace('"', '')
            motion = motion.replace(u'“', '')
            motion = motion.replace(u'\u201d', '')
            motion = motion.replace(u' ,', ',')
            motion = motion.strip()
            motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
            motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

            for word, letter in (('Senate', 'S'),
                                 ('House', 'H'),
                                 ('File', 'F')):
                bill_id = bill_id.replace(word, letter)

            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            self.current_id = bill_id
            votes = self.parse_votes(lines)
            # Only the integer entries of the parse result are tallies.
            total = sum(v for v in votes.values() if isinstance(v, int))
            # With no recorded tallies the vote cannot have passed; this also
            # avoids dividing by zero.
            passed = total > 0 and (1.0 * votes['yes_count'] / total) >= 0.5
            vote = Vote(motion=motion,
                        passed=passed,
                        chamber=chamber, date=date,
                        session=session, bill_id=bill_id,
                        bill_chamber=bill_chamber,
                        **votes)
            # NOTE(review): presumably re-applies keys that Vote's
            # constructor may reset -- confirm before removing.
            vote.update(votes)
            vote.add_source(url)
            self.save_vote(vote)
Esempio n. 9
0
    def scrape_votes(self, url, motion, date, chamber):
        """Build a ``Vote`` from the roll call PDF at ``url``.

        The PDF text is split into lines (rejoined by ``_combine_lines``).
        A heading such as 'Yeas--' selects the list that the following
        comma-separated names are appended to; 'None.', a 'Total--' entry or
        'DISCLAIMER' ends the current list.

        Args:
            url: location of the vote PDF.
            motion: key into ``self._vote_mapping``, which supplies the motion
                text and the passed flag.
            date: date handed to ``Vote``.
            chamber: chamber handed to ``Vote``.

        Returns:
            A ``Vote`` whose counts are the lengths of the parsed name lists.

        Raises:
            KeyError: when ``motion`` is not in ``self._vote_mapping``.
        """
        vote_pdf, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(vote_pdf, 'text')
        finally:
            # Remove the download even when conversion fails.
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []

        # point at array to add names to
        cur_array = None

        precursors = (
            ('Yeas--', yes_votes),
            ('Nays--', no_votes),
            ('Absent or those not voting--', other_votes),
            ('Absent and those not voting--', other_votes),
            ('Voting Present--', other_votes),
            ('Present--', other_votes),
            ('DISCLAIMER', None),
        )

        # Fragments that show an entry is not a voter's name.
        junk_markers = (
            'on final passage', 'Necessary', 'who would have',
            'being a tie', 'therefore', 'Vacancies', 'a pair',
            'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD', 'TREASURER',
            'GOVERNOR', 'ARCHIVES', 'SECRETARY',
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.split('\n'))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line:
                    cur_array = arr
                    line = line.replace(pc, '')

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None
                match = re.match(r'(.+?)\. Total--.*', name)
                if match:
                    # The text before ". Total--" is the section's last name;
                    # skip it when no section is open (e.g. "None. Total--0").
                    if cur_array is not None:
                        cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = any(junk in name for junk in junk_markers)
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == '.':
                        name = name[:-1]
                    cur_array.append(name)

        # return vote object
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        other_count = len(other_votes)
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote['yes_votes'] = yes_votes
        vote['no_votes'] = no_votes
        vote['other_votes'] = other_votes
        return vote
Esempio n. 10
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble and save every bill of a session from the DBF data files.

        Bills are created from the MAINBILL table and then enriched with
        sponsors (BILLSPON), documents and versions (BILLWP), roll-call votes
        (zipped CSV files retrieved from the votes FTP directory), actions
        (BILLHIST) and subjects (BILLSUBJ). Every bill is saved at the end.

        MAINBILL records with a blank synopsis are not turned into bills.
        Rows of the other tables that point at such a bill, or at any other
        bill id missing from MAINBILL, are skipped with a warning instead of
        aborting the whole scrape with a ``KeyError``.

        :param session: session identifier, stored on each bill as
            ``str(session)``.
        :param year_abr: year value handed to ``get_dbf`` and used to build
            document URLs and vote file names; ``int(year_abr) + 1`` is
            used as the second year in the vote file names.
        """

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # bill_id -> Bill for every bill created from MAINBILL
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_id = bill_type + str(int(rec["billnumber"]))
            title = rec["synopsis"]
            # bill types starting with 'A' are filed under the lower chamber
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip them
            if not title:
                continue

            bill = Bill(str(session),
                        chamber,
                        bill_id,
                        title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(
            year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                self.warning('invalid bill id in BILLSPON.DBF: %s' % bill_id)
                continue
            sponsor_type = "Primary" if rec["type"] == 'P' else "Co-sponsor"
            bill.add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                self.warning('invalid bill id in BILLWP.DBF: %s' % bill_id)
                continue

            # keep only the last directory and the file name of the
            # backslash-separated path
            path_parts = rec["document"].split('\\')
            document = path_parts[-2] + "/" + path_parts[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
                year_abr, document.replace('.DOC', '.HTM'))

            # name document based on its doctype, plus the optional comment
            doc_name = self._doctypes[rec['doctype']]
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            s_vote_zip, _resp = self.urlretrieve(s_vote_url)

            # 'A...' / 'CA...' files are filed under the lower chamber; a
            # leading 'C' marks a committee vote file.
            if filename.startswith(('A', 'CA')):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            # vote_id -> Vote, one per (bill, chamber, action) in this file
            votes = {}

            zipedfile = zipfile.ZipFile(s_vote_zip)
            try:
                vote_file = zipedfile.open("%s.txt" % filename, 'U')

                for rec in csv.DictReader(vote_file):
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]
                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'],
                                            rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    # counts and outcome are filled in once all rows are read
                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber,
                                              date,
                                              action,
                                              None,
                                              None,
                                              None,
                                              None,
                                              bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)
            finally:
                # release the archive and remove the temp file even when a
                # record fails to parse
                zipedfile.close()
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                bill = bill_dict.get(vote_bill_id)
                if bill is None:
                    self.warning('invalid bill id in %s votes: %s' %
                                 (filename, vote_bill_id))
                    continue
                bill.add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                self.warning('invalid bill id in BILLHIST.DBF: %s' % bill_id)
                continue
            action, atype = self.categorize_action(rec["action"])
            comment = rec["comment"]
            if comment:
                action += (' ' + comment)
            bill.add_action(rec["house"], action, rec["dateaction"],
                            type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Esempio n. 11
0
    def scrape(self, session, chambers):
        """Scrape all bills and resolutions of a session and save them.

        The list of bills and the list of resolutions are loaded from the
        site's JSON endpoints. For each entry the bill status page is fetched
        for sponsors and text versions, then the detailed-status and roll-call
        endpoints are queried for actions and votes. Each bill is saved with
        ``save_bill``; bills whose status page exposes no internal id are
        saved without actions or votes.

        :param session: session name; everything after its fifth character
            is used as the year slug in URLs.
        :param chambers: not used by this method.
        :raises AssertionError: on an unrecognised bill number prefix, an
            amendment without a chamber, an unknown action actor, an unclear
            roll-call result, or yea/nay totals that disagree with the
            member list.
        """
        HTML_TAGS_RE = r'<.*?>'

        # (BillNumber prefix, bill type, chamber). Order matters: the generic
        # 'H.' / 'S.' prefixes also match the longer ones listed before them.
        # A chamber of None means it is read from the record's 'Body' field.
        BILL_PREFIXES = (
            ('J.R.H.', 'joint resolution', 'lower'),
            ('J.R.S.', 'joint resolution', 'upper'),
            ('H.C.R.', 'concurrent resolution', 'lower'),
            ('S.C.R.', 'concurrent resolution', 'upper'),
            ('H.R.', 'resolution', 'lower'),
            ('S.R.', 'resolution', 'upper'),
            ('PR.', 'constitutional amendment', None),
            ('H.', 'bill', 'lower'),
            ('S.', 'bill', 'upper'),
        )

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.urlopen(bills_url)
        bills = json.loads(bills_json)['data']

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.urlopen(resolutions_url)
        bills.extend(json.loads(resolutions_json)['data'])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}

            # Identify the bill type and chamber
            for prefix, bill_type, bill_chamber in BILL_PREFIXES:
                if info['BillNumber'].startswith(prefix):
                    break
            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            if bill_chamber is None:
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            # Create the bill using its basic information
            bill = Bill(session=session,
                        bill_id=info['BillNumber'],
                        title=info['Title'],
                        chamber=bill_chamber,
                        type=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                # Everything after this marker item is a cosponsor
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()

                # Skip the five-character "Less" + one character entry
                # (presumably a collapse link with a trailing ellipsis). The
                # previous test compared a five-character slice with the
                # four-character "Less", which could never match.
                is_less_link = (len(sponsor_name) == 5 and
                                sponsor_name.startswith("Less"))
                if sponsor_name and not is_less_link:
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a')
            for version in versions:
                bill.add_version(name=version.xpath('text()')[0],
                                 url=version.xpath('@href')[0].replace(
                                     ' ', '%20'),
                                 mimetype='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                        year_slug), lxml.etree.tostring(doc)).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".\
                        format(info['BillNumber']))
                self.save_bill(bill)
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.urlopen(actions_url)
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            # Chamber codes ("H"/"S") that already recorded a passage action
            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    assert chambers_passed == set("HS")
                    action_type = 'governor:signed'
                elif actor == 'lower' and \
                        action['FullStatus'] in (
                        "Passed", "Read Third time and Passed",
                        "Read and Adopted in Concurrence", "Read and Adopted",
                        "Adopted", "Adopted in Concurrence"):
                    action_type = 'bill:passed'
                    assert "H" not in chambers_passed
                    chambers_passed.add("H")
                elif actor == 'upper' and \
                        any(action['FullStatus'].startswith(x) for x in (
                        "Read 3rd time & passed",
                        "Read & adopted", "Adopted")):
                    action_type = 'bill:passed'
                    assert "S" not in chambers_passed
                    chambers_passed.add("S")
                else:
                    action_type = 'other'

                bill.add_action(actor=actor,
                                action=re.sub(HTML_TAGS_RE, "",
                                              action['FullStatus']),
                                date=datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                type=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.urlopen(votes_url)
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.urlopen(roll_call_url)
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    # MemberName is split as "<name> of <district>"; only
                    # the name is kept
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))
                if yea_count != len(roll_call_yea) or \
                        nay_count != len(roll_call_nay):
                    raise AssertionError(
                        "Yea and/or nay counts incongruous:\n" +
                        "Yeas from vote text: {}\n".format(yea_count) +
                        "Yeas from number of members: {}\n".format(
                            len(roll_call_yea)) +
                        "Nays from vote text: {}\n".format(nay_count) +
                        "Nays from number of members: {}".format(
                            len(roll_call_nay)))

                vote_to_add = Vote(chamber=('lower' if vote['ChamberCode']
                                            == 'H' else 'upper'),
                                   date=datetime.datetime.strptime(
                                       vote['StatusDate'], '%m/%d/%Y'),
                                   motion=re.sub(HTML_TAGS_RE, "",
                                                 vote['FullStatus']).strip(),
                                   passed=did_pass,
                                   yes_count=yea_count,
                                   no_count=nay_count,
                                   other_count=len(roll_call_other))
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Esempio n. 12
0
    def scrape_votes(self, bill, url):
        """Parse every roll call on a vote page and attach it to ``bill``.

        The page is treated as a flat sequence of ``<p>`` elements. Each
        paragraph matching ``OKLAHOMA HOUSE`` / ``OKLAHOMA STATE SENATE``
        starts one roll call; the motion, RCS number, date, tallies and
        voter names are read from the paragraphs that follow it.

        :param bill: bill object that receives the votes via ``add_vote``.
        :param url: address of the vote page; also recorded as vote source.
        """
        # Non-breaking spaces are normalised so the regexes below can match
        page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

        # EXSLT regular expressions are needed for re:test() in the XPath
        re_ns = "http://exslt.org/regular-expressions"
        path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={'re': re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            assert motion.strip(), "Motion text not found"
            # A trailing PASSED/FAILED gives the outcome and is removed from
            # the motion text; otherwise the outcome is derived from counts
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            # The date is read from the element right after the RCS# one
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the category ('yes'/'no'/'other') that name lines are
            # currently filed under; counts and votes are keyed by it
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            # Name lines and NAYS/other headings are only honoured after the
            # first YEAS heading that is not on an RCS# line
            seen_yes = False

            # The first 13 following paragraphs are skipped; parsing stops at
            # a line of asterisks
            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break

                match = re.match(
                    r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)',
                    line)
                if match:
                    if match.group(1) == 'YEAS' and 'RCS#' not in line:
                        vtype = 'yes'
                        seen_yes = True
                    elif match.group(1) == 'NAYS' and seen_yes:
                        vtype = 'no'
                    elif match.group(1) == 'VACANT':
                        continue  # skip these
                    elif seen_yes:
                        vtype = 'other'
                    # Text after the count on a heading line marks the whole
                    # roll call as unusable
                    if seen_yes and match.group(3).strip():
                        self.logger.warning("Bad vote format, skipping.")
                        bad_vote = True
                    # Before the first YEAS heading vtype is still None, so
                    # such counts land under the None key and are never read
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    # Names on a line are separated by runs of three spaces
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE' in name or 'SENATE ' in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber,
                        date,
                        motion,
                        passed,
                        counts['yes'],
                        counts['no'],
                        counts['other'],
                        rcs_num=rcs)
            vote.validate()

            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                # A colon in a name means a heading line was parsed as a name
                if ':' in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes['other']:
                vote.other(name)

            # Validate again now that the individual voters are attached
            vote.validate()
            bill.add_vote(vote)
Esempio n. 13
0
    def scrape_vote(self, bill, name, url):
        """Parse one roll-call page and attach the resulting vote to ``bill``.

        Totals are read from the ``<span>`` elements labelled "Those voting
        Yea", "Those voting Nay", "Those absent" and "Necessary for". The
        vote is recorded as passed when the yea total reaches the
        "Necessary for" total. Individual votes are read from the first
        table on the page, whose column layout depends on the chamber.

        Pages containing 'BUDGET ADDRESS' are ignored.

        :param bill: bill object; ``bill['session']`` supplies the year of
            the vote date and the vote is added with ``bill.add_vote``.
        :param name: motion text stored on the vote.
        :param url: address of the roll-call page; "VOTE/H" in it selects
            the lower-chamber layout.
        """
        # cols holds the 1-based first column of each voter group in a row;
        # the offsets locate the name / yea / nay cells within a group.
        if "VOTE/H" in url:
            vote_chamber = 'lower'
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = 'upper'
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        # Connecticut's SSL is causing problems with Scrapelib, so use Requests
        # NOTE(review): verify=False disables certificate verification.
        page = requests.get(url, verify=False).text

        if 'BUDGET ADDRESS' in page:
            return

        page = lxml.html.fromstring(page)

        def span_count(label):
            # First number found in the span whose text contains `label`
            text = page.xpath("string(//span[contains(., '%s')])" % label)
            return int(re.match(r'[^\d]*(\d+)[^\d]*', text).group(1))

        yes_count = span_count('Those voting Yea')
        no_count = span_count('Those voting Nay')
        other_count = span_count('Those absent')
        need_count = span_count('Necessary for')

        # The page only gives month/day; the year comes from the session
        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(date + " " + bill['session'],
                                          "%m/%d %Y").date()

        # Reaching the necessary number of yeas is enough to pass; a strict
        # ">" recorded votes with exactly that many yeas as failed.
        vote = Vote(vote_chamber, date, name, yes_count >= need_count,
                    yes_count, no_count, other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for i in cols:
                # Separate local so the `name` (motion) parameter is not
                # overwritten
                voter = row.xpath("string(td[%d])" % (i + name_offset)).strip()

                if not voter or voter == 'VACANT':
                    continue

                if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                    vote.yes(voter)
                elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                    vote.no(voter)
                else:
                    vote.other(voter)

        bill.add_vote(vote)
Esempio n. 14
0
    def scrape(self, chamber, session):
        """Scrape roll-call votes from a chamber's journal PDFs and save them.

        The journal index page for the session and chamber is fetched, every
        linked PDF is converted to text, and the text is walked line by line
        with a small state machine (``in_motion`` / ``in_vote``) that collects
        the motion text and the names listed under each "<CATEGORY>:" heading.
        A vote is saved when a line containing "passed" or "lost" is reached
        while inside a vote and that line names a bill.

        :param chamber: 'lower' (House journal) or anything else (Senate
            journal); also stored on every saved vote.
        :param session: session key; must be '62' or '63'.
        :raises KeyError: if ``session`` is not a known session key.
        """
        # 'lower' is the House and 'upper' the Senate; the mapping used to be
        # inverted, which filed Senate journal votes under the lower chamber.
        chamber_name = 'house' if chamber == 'lower' else 'senate'
        session_slug = {'62': '62-2011', '63': '63-2013'}[session]

        # journal heading -> Vote method name
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other"
        }
        # first letter of the bill id -> chamber the bill belongs to
        chambers = {"H": "lower", "S": "upper", "J": "joint"}

        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug, chamber_name)
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            results = {}        # heading -> list of names for current vote
            in_vote = False     # currently inside a roll-call listing
            cur_date = None     # first date found in this PDF
            in_motion = False   # currently inside a multi-line motion
            cur_vote = None     # heading that name lines are filed under
            cur_motion = ""

            pdf_url = pdf.attrib['href']
            (path, response) = self.urlretrieve(pdf_url)
            data = convert_pdf(path, type='text')
            os.unlink(path)
            lines = data.splitlines()
            for line in lines:
                # Only the first date in the PDF is used for all its votes
                date = re.findall(date_re, line)
                if date != [] and not cur_date:
                    date = date[0][0]
                    cur_date = datetime.datetime.strptime(
                        date, "%A, %B %d, %Y")

                # A blank line ends a motion
                if line.strip() == "":
                    in_motion = False
                    continue

                if any(x in line.lower()
                       for x in ['passed', 'lost']) and in_vote:
                    in_vote = False
                    bills = re.findall(r"(?i)(H|S|J)(B|R|M) (\d+)", line)
                    if bills == [] or cur_motion.strip() == "":
                        # Nothing to attach the vote to: drop what was
                        # collected (the motion text is kept as before)
                        results = {}
                        in_motion = False
                        cur_vote = None
                        continue

                    # The last bill named on the outcome line is used
                    cur_bill_id = "%s%s %s" % (bills[-1])
                    res = {}
                    for key in keys:
                        res[keys[key]] = [a for a in results.get(key, [])
                                          if a != ""]

                    # results
                    results = {}
                    yes, no, other = len(res['yes']), len(res['no']), \
                                        len(res['other'])
                    bc = chambers.get(cur_bill_id[0], 'other')

                    vote = Vote(chamber,
                                cur_date,
                                cur_motion, (yes > no),
                                yes,
                                no,
                                other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)

                    vote.add_source(pdf_url)
                    vote.add_source(url)

                    for key in res:
                        obj = getattr(vote, key)
                        for person in res[key]:
                            obj(person)

                    self.save_vote(vote)

                    # Reset for the next vote; the current line still falls
                    # through to the checks below, as it always did
                    results = {}
                    in_vote = False
                    in_motion = False
                    cur_vote = None
                    cur_motion = ""

                if 'VOTES FOR' in line:
                    in_motion = False
                    in_vote = False
                    continue

                # NOTE(review): 'ABSET' looks like a typo of 'ABSENT'; left
                # unchanged because correcting it would alter when the parser
                # enters vote mode. Confirm against real journal text.
                if 'ABSET' in line:
                    if in_motion:
                        in_vote = True
                    in_motion = False

                # "<HEADING>: name; name; ..." starts a new category
                if ":" in line and in_vote:
                    cur_vote, who = line.split(":", 1)
                    who = [x.strip() for x in who.split(';')]
                    results[cur_vote] = who
                    continue

                # Continuation line of the current category
                if in_vote:
                    if cur_vote is None:
                        continue

                    who = [x.strip() for x in line.split(";")]
                    for person in who:
                        results[cur_vote].append(person)
                    continue

                if "question being" in line:
                    cur_motion = line.strip()
                    in_motion = True
                    continue

                if in_motion:
                    cur_motion += line.strip()
                    continue

                if line.strip() == 'ROLL CALL':
                    in_vote = True
Esempio n. 15
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page (title, sponsors, actions, vote counts).

        The page at ``base_url?r=<bill_id>`` is parsed for the title, the
        author and co-author lists and the action table (the last table on
        the page). Action rows whose text ends in a ``NN-NN-NN-NN`` tally
        are additionally recorded as votes with counts only. The bill is
        saved at the end.

        :param chamber: chamber the bill is filed under; also the fallback
            chamber for votes.
        :param session: session passed to the ``Bill`` constructor.
        :param bill_id: bill identifier, used in the page URL.
        :param bill_type: bill type passed to the ``Bill`` constructor.
        :raises NoSuchBill: if the page has no title cell.
        :raises AssertionError: if a tallied action matches no known vote
            wording.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.get(url).text
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)

        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # The names are taken from the second text node of the cell
            for co_author in co_authors[1].split(','):
                co_name = self.clean_name(co_author).strip()
                # Skip blank entries (e.g. from a trailing comma), exactly
                # as is done for primary sponsors; they used to be added as
                # empty cosponsors.
                if co_name:
                    bill.add_sponsor('cosponsor', co_name)

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            # A row with an empty date cell reuses the date of the row
            # before it
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has a hyphen let's assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)

            # Some lower-house roll calls could be parsed, but finnicky
            # Most roll lists are just images embedded within a document,
            # and offer no alt text to scrape
            # Instead, just scrape the vote counts
            vote_info = re.search(
                r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$',
                action)
            if vote_info and re.search(r'\d{1,2}', action):
                vote_name = vote_info.group(1)

                # Work out the voting chamber (and, for some wordings, a
                # shorter motion text) from the wording of the action
                if u"Votación Final" in vote_name:
                    (vote_chamber,
                     vote_name) = re.search(r'(?u)^\w+ por (.*?) en (.*)$',
                                            vote_name).groups()
                    if "Senado" in vote_chamber:
                        vote_chamber = 'upper'
                    else:
                        vote_chamber = 'lower'

                elif "Cuerpo de Origen" in vote_name:
                    vote_name = re.search(r'(?u)^Cuerpo de Origen (.*)$',
                                          vote_name).group(1)
                    vote_chamber = chamber

                elif u"informe de Comisión de Conferencia" in vote_name:
                    (vote_chamber, vote_name) = re.search(
                        r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                        vote_name).groups()
                    if vote_chamber == "Senado":
                        vote_chamber = 'upper'
                    else:
                        vote_chamber = 'lower'

                elif u"Se reconsideró" in vote_name:
                    # A reconsideration reuses the chamber of the previous
                    # vote on this bill, if there is one
                    if bill['votes']:
                        vote_chamber = bill['votes'][-1]['chamber']
                    else:
                        vote_chamber = chamber

                else:
                    raise AssertionError(
                        u"Unknown vote text found: {}".format(vote_name))

                vote_name = vote_name.title()

                # Tally groups: yes, no, then two groups both counted as
                # "other" (either may be blank)
                yes = int(vote_info.group(2))
                no = int(vote_info.group(3))
                other = 0
                if vote_info.group(4).strip():
                    other += int(vote_info.group(4))
                if vote_info.group(5).strip():
                    other += int(vote_info.group(5))

                vote = Vote(chamber=vote_chamber,
                            date=date,
                            motion=vote_name,
                            passed=(yes > no),
                            yes_count=yes,
                            no_count=no,
                            other_count=other)
                vote.add_source(url)
                bill.add_vote(vote)

        bill.add_source(url)
        self.save_bill(bill)
Esempio n. 16
0
    def scrape_bill_sheet(self, session, chamber):
        """Scrape every bill on a chamber's bill sheet, with actions and votes.

        For each table row of the sheet that carries a bill id, this builds a
        ``Bill``, attaches its history actions, its sponsors and its recorded
        votes, and saves it through ``self.save_bill``.

        session -- session identifier; handed to ``get_bill_folder``,
                   ``Bill`` and ``get_vote_url``.
        chamber -- ``"Senate"`` or ``"House"``; any other value raises
                   ``KeyError``.

        Asserts that the bill id reported by the vote page matches the id
        read from the sheet.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions of the fields inside each row of the sheet table.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id is None:
                    # Every other entry is null for some reason
                    continue

                # Keep only the part before the first "."; partition() leaves
                # an id without a dot untouched, whereas slicing on find()
                # (which returns -1) would silently drop its last character.
                bill_id = bill_id.partition(".")[0]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                # Whatever follows the title in the cell is the sponsor list,
                # separated by "--".
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b = Bill(session, bill_chamber, bill_id, bill_title)

                for action in history:
                    self.add_action_to_bill(b, action)

                for sponsor in sponsors:
                    b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                if votes['sanity-check'] != bill_id:
                    print("XXX: READ ME!")
                    print(" -> Scraped ID: " + votes['sanity-check'])
                    print(" -> 'Real' ID:  " + bill_id)
                    assert votes['sanity-check'] == bill_id

                for vote_data in votes['votes']:
                    print(vote_data)
                    filed_votes = vote_data['votes']
                    passage = vote_data['meta']
                    result = vote_data['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    has_house = "House" in passage['x-parent-ctty']
                    has_senate = "Senate" in passage['x-parent-ctty']

                    if has_house and has_senate:
                        actor = "legislature"
                    elif has_house:
                        actor = "lower"
                    else:
                        actor = "upper"

                    # Convert each tally before adding: adding first would
                    # concatenate string tallies ("1" + "2" -> 12).
                    other_count = int(result['EXC']) + int(result['ABS'])

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "YES"),
                             int(result['YES']),
                             int(result['NO']),
                             other_count,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])
                    # XXX: Add more stuff to kwargs, we have a ton of data
                    for who in filed_votes:
                        cast = filed_votes[who].lower()
                        if cast == "yes":
                            v.yes(who)
                        elif cast == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    v.add_source(bill_vote_href)
                    b.add_vote(v)
                self.save_bill(b)
Esempio n. 17
0
    def parse_senate_vote(self, url):
        """ senate PDFs -> garbled text -> good text -> Vote

        Downloads the PDF at ``url``, converts it to text lines and walks
        them with a two-state machine: outside the roll-call table it looks
        for the date and the ``PASSED`` marker, inside it it reads one row of
        two voters per line until the ``TOTALS`` line.

        Returns the populated ``Vote``.  A roll-call row that does not have
        the expected two-voter layout raises ``AttributeError`` (the regex
        match is ``None``).
        """
        vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
        vote.add_source(url)

        fname, resp = self.urlretrieve(url)
        # this gives us the cleaned up text
        sv_text = convert_sv_text(convert_pdf(fname, 'text'))
        os.remove(fname)
        in_votes = False

        # use in_votes as a sort of state machine
        for line in sv_text:

            # not 'in_votes', get date or passage
            if not in_votes:
                dmatch = re.search(r'DATE:(\d{2}-\d{2}-\d{2})', line)
                if dmatch:
                    vote['date'] = datetime.strptime(dmatch.group(1),
                                                     '%m-%d-%y')

                if 'YES NO ABS EXC' in line:
                    in_votes = True
                elif 'PASSED' in line:
                    vote['passed'] = True
                continue

            # in_votes: totals & votes
            if 'TOTALS' in line:

                # Lt. Governor voted: the name precedes the totals on the
                # same line, so record it and keep only the remainder.
                if 'GOVERNOR' in line:
                    name, spaces, line = re.match(r' ([A-Z,.]+)(\s+)X(.*)',
                                                  line).groups()
                    if len(spaces) == 1:
                        vote.yes(name)
                    else:
                        vote.no(name)

                _, yes, no, absent, excused = line.split()
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(absent) + int(excused)
                # no longer in votes
                in_votes = False
                continue

            # pull votes out
            matches = re.match(r' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X',
                               line).groups()
            name1, spaces1, name2, spaces2 = matches

            # vote can be determined by # of spaces between name and mark:
            # one space is a yes, two a no, anything else counts as other
            for name, spaces in ((name1, spaces1), (name2, spaces2)):
                if len(spaces) == 1:
                    vote.yes(name)
                elif len(spaces) == 2:
                    vote.no(name)
                else:
                    vote.other(name)
        return vote
Esempio n. 18
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one bill's status page and save the resulting ``Bill``.

        The page is looked up under the first year of ``session`` and, when
        that page is missing, under the last year.  Sponsors, subjects,
        actions, roll-call votes referenced by actions, versions and
        documents are attached before the bill is saved.

        chamber  -- chamber the bill is filed under, passed to ``Bill``.
        session  -- session string; its first and last four characters are
                    used as the two candidate years.
        bill_id  -- bill identifier such as ``"HB 4001"``.

        Returns ``None`` when neither year has a page for the bill, ``True``
        after the bill has been saved.
        """
        import mimetypes

        def not_found(page):
            """Tell whether ``page`` is one of the site's 'no such bill' pages."""
            return ('Page Not Found' in page or
                    'The bill you are looking for is not available yet'
                    in page)

        url_template = 'http://legislature.mi.gov/doc.aspx?%s-%s'
        slug = bill_id.replace(' ', '-')

        # try and get bill for current year
        url = url_template % (session[:4], slug)
        html = self.get(url).text
        # if first page isn't found, try second year
        if not_found(html):
            # Rebind ``url`` so the source recorded on the bill is the page
            # that was actually scraped, not the first-year miss.
            url = url_template % (session[-4:], slug)
            html = self.get(url).text
            if not_found(html):
                return None

        doc = lxml.html.fromstring(html)

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(session=session,
                    chamber=chamber,
                    bill_id=bill_id,
                    title=title,
                    type=bill_type)
        bill.add_source(url)

        # sponsors: the first listed is the primary, the rest are cosponsors
        sp_type = 'primary'
        for sponsor in doc.xpath(
                '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ')
            bill.add_sponsor(sp_type, sponsor)
            sp_type = 'cosponsor'

        bill['subjects'] = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            action_type = categorize_action(action)
            bill.add_action(actor, action, date, type=action_type)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if not rcmatch:
                continue

            rc_num = rcmatch.group(1)
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if not journal_link:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
                continue

            objectname = journal_link[0].rsplit('=', 1)[-1]
            chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
            vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                session, chamber_name, objectname)
            vote = Vote(actor, date, action, False, 0, 0, 0)
            self.parse_roll_call(vote, vote_url, rc_num)

            # check the expected counts (from the action text) vs the
            # number of names actually parsed from the journal
            for label, key in (('YEAS', 'yes_votes'), ('NAYS', 'no_votes')):
                count = re.search(r'%s (\d+)' % label, action, re.IGNORECASE)
                count = int(count.group(1)) if count else 0
                if count != len(vote[key]):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote[key])))

            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(vote_url)
            bill.add_vote(vote)

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            version = self.parse_doc_row(row)
            if not version:
                continue
            version_url = version[1]
            if version_url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.endswith('.htm'):
                mimetype = 'text/html'
            else:
                # Decide per row: without this branch an unexpected extension
                # either reused the previous row's mimetype or raised
                # UnboundLocalError on the first row.
                mimetype = mimetypes.guess_type(version_url)[0]
                if mimetype is None:
                    self.warning('unknown mimetype for version %s of %s' %
                                 (version_url, bill_id))
            bill.add_version(*version, mimetype=mimetype)

        # documents
        for table_id in ('frg_billstatus_HlaTable', 'frg_billstatus_SfaTable'):
            for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
                document = self.parse_doc_row(row)
                if document:
                    bill.add_document(*document)

        self.save_bill(bill)
        return True
Esempio n. 19
0
    def _parse_votes(self, url, vote):
        '''Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.

        url  -- address of the vote document; a ``.pdf`` url is handed to
                ``PDFCommitteeVote``, anything else is parsed as HTML.
        vote -- mapping of ``Vote`` keyword arguments; must contain
                ``'action'``.

        Returns a ``Vote``, or ``None`` when the document is missing, the PDF
        cannot be parsed, or the page has no tally table.

        Raises ``Exception`` when the action text gives no usable indication
        of whether the vote passed, or when passage and failure indicators
        are both present while the yeas outnumber the nays.
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        keymap = {'Y': 'yes', 'N': 'no'}
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused, absent = map(int, vals)
        vote.update(yes_count=yes_count, no_count=no_count,
                    other_count=excused + absent)

        # Get the motion: the text following the last <br> on the page.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except (IndexError, AttributeError):
            # Some of them mysteriously have no motion listed (no <br> at
            # all, or a last <br> with no trailing text).
            motion = vote['action']

        if not motion:
            motion = vote['action']

        vote['motion'] = motion

        # Add placeholder for passed (see below)
        vote['passed'] = False

        vote = Vote(**vote)

        # Each cell of the third table reads "<code>\xa0<name>".
        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            code, name = filter(None, text.split(u'\xa0'))
            getattr(vote, keymap.get(code, 'other'))(name)

        action = vote['action']

        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for indicator in vote_passage_indicators:
            if indicator in action:
                passed = True
        for indicator in vote_failure_indicators:
            if indicator not in action:
                continue
            if passed is True:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                #
                # Compare the tallies, not the lists of voter names:
                # comparing the lists orders them lexicographically.
                if no_count >= yes_count:
                    passed = False
                else:
                    raise Exception("passage and failure indicator "
                                    "both present at: %s" % url)
            elif passed is None:
                passed = False
        for indicator in vote_ambiguous_indicators:
            if indicator in action:
                passed = yes_count > no_count
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote['passed'] = passed

        return vote
Esempio n. 20
0
    def scrape_pdf_for_votes(self, session, chamber, date, motion, href):
        """Build a ``Vote`` from the roll-call PDF at ``href``.

        The pass/fail word and the printed tallies are taken from the
        document itself; the individual votes come from the lines that follow
        the tally line.  Mismatches between printed tallies and parsed names
        are reported as warnings, not errors.

        session -- key into ``self.metadata['session_details']``, used to
                   resolve 'Mr. Speaker' / 'Mr. President' to a name.
        chamber -- chamber of the vote; also decides whether a missing
                   pass/fail word is worth a warning.
        date    -- date of the vote, passed through to ``Vote``.
        motion  -- motion text, passed through to ``Vote``.
        href    -- url of the PDF.

        Raises ``Exception`` when the document contains more than one
        pass/fail word.
        """
        warned = False
        COUNT_RE = re.compile(r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$')
        PASS_FAIL_WORDS = {
            'PASSED': True,
            'PREVAILED': True,
            'ADOPTED': True,
            'CONCURRED': True,
            'FAILED': False,
            'LOST': False,
        }

        pdflines = self.fetch_pdf_lines(href)

        yes_count = no_count = present_count = other_count = 0
        yes_votes = []
        no_votes = []
        present_votes = []
        other_vote_detail = defaultdict(list)
        passed = None
        counts_found = False
        vote_lines = []
        for line in pdflines:
            # consider pass/fail as a document property instead of a result of the vote count
            # extract the vote count from the document instead of just using counts of names
            stripped = line.strip()
            if not stripped:
                continue

            if stripped in PASS_FAIL_WORDS:
                if passed is not None:
                    raise Exception("Duplicate pass/fail matches in [%s]" % href)
                passed = PASS_FAIL_WORDS[stripped]
                continue

            count_match = COUNT_RE.match(line)
            if count_match:
                # The optional fourth group (NOT VOTING) is not used.
                yes_count, no_count, present_count = [
                    int(number) for number in count_match.groups()[:3]]
                counts_found = True
            elif counts_found:
                # Only lines after the tally line can be roll-call lines.
                for value in VOTE_VALUES:
                    if re.search(r'^\s*({})\s+\w'.format(value), line):
                        vote_lines.append(line)
                        break

        votes = find_columns_and_parse(vote_lines)
        for name, vcode in votes.items():
            if name == 'Mr. Speaker':
                name = self.metadata['session_details'][session]['speaker']
            elif name == 'Mr. President':
                name = self.metadata['session_details'][session]['president']
            if vcode == 'Y':
                yes_votes.append(name)
            elif vcode == 'N':
                no_votes.append(name)
            else:
                other_vote_detail[vcode].append(name)
                other_count += 1
                if vcode == 'P':
                    present_votes.append(name)
        # fake the counts
        if yes_count == 0 and no_count == 0 and present_count == 0:
            yes_count = len(yes_votes)
            no_count = len(no_votes)
        else:  # audit
            if yes_count != len(yes_votes):
                self.warning("Mismatched yes count [expect: %i] [have: %i]" % (yes_count, len(yes_votes)))
                warned = True
            if no_count != len(no_votes):
                self.warning("Mismatched no count [expect: %i] [have: %i]" % (no_count, len(no_votes)))
                warned = True
            if present_count != len(present_votes):
                self.warning("Mismatched present count [expect: %i] [have: %i]" % (present_count, len(present_votes)))
                warned = True

        if passed is None:
            if chamber == 'lower':  # senate doesn't have these lines
                self.warning("No pass/fail word found; fall back to comparing yes and no vote.")
                warned = True
            passed = yes_count > no_count
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count, other_vote_detail=other_vote_detail)
        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for names in other_vote_detail.values():
            for name in names:
                vote.other(name)
        vote.add_source(href)

        if warned:
            self.warning("Warnings were issued. Best to check %s" % href)
        return vote
Esempio n. 21
0
    def scrape(self, chamber, session):
        """Scrape all roll-call votes of one chamber from the vote-data zip.

        Downloads the session's zip archive, reads its Members, Votes and
        MemberVotes text files, builds one ``Vote`` per bill vote taken in
        ``chamber`` and saves each of them with ``self.save_vote``.  The zip
        handle is closed and the downloaded file removed even when parsing
        fails part-way.

        chamber -- ``'lower'`` selects rows coded ``H``; anything else
                   selects rows coded ``S``.
        session -- session string; ``'2009'`` uses a different url, file
                   naming scheme and field delimiter.
        """
        if session == '2009':
            # 2009 files have a different delimiter and naming scheme.
            vote_data_url = 'ftp://www.ncga.state.nc.us/Bill_Status/Vote Data 2009.zip'
            naming_scheme = '{session}{file_label}.txt'
            delimiter = ";"
        else:
            vote_data_url = 'ftp://www.ncga.state.nc.us/Bill_Status/Votes%s.zip' % session
            naming_scheme = '{file_label}_{session}.txt'
            delimiter = "\t"
        fname, resp = self.urlretrieve(vote_data_url)
        # fname = "/Users/brian/Downloads/Vote Data 2009.zip"

        chamber_code = 'H' if chamber == 'lower' else 'S'

        try:
            zf = ZipFile(fname)
            try:
                # Members_YYYY.txt: tab separated
                # 0: id (unique only in chamber)
                # 1: H or S
                # 2: member name
                # 3-5: county, district, party
                # 6: mmUserId
                member_file = zf.open(naming_scheme.format(
                    file_label='Members', session=session))
                members = {}
                for line in member_file.readlines():
                    data = line.split(delimiter)
                    if data[1] == chamber_code:
                        members[data[0]] = data[2]

                # Votes_YYYY.txt
                # 0: sequence number
                # 1: chamber (S/H)
                # 2: date
                # 3: prefix
                # 4: bill_id
                # 5: yes votes
                # 6: no votes
                # 7: excused absences
                # 8: excused votes
                # 9: didn't votes
                # 10: total yes+no
                # 11: sponsor
                # 12: reading info
                # 13: info
                # 20: PASSED/FAILED
                # 21: legislative day
                vote_file = zf.open(naming_scheme.format(
                    file_label='Votes', session=session))
                bill_chambers = {'H': 'lower', 'S': 'upper'}
                votes = {}
                for line in vote_file.readlines():
                    data = line.split(delimiter)
                    if data[1] != chamber_code:
                        continue
                    date = datetime.datetime.strptime(data[2][:16],
                                                      '%Y-%m-%d %H:%M')
                    # Slice instead of index so an empty prefix is skipped
                    # like any other non-bill vote rather than raising.
                    prefix_chamber = data[3][:1]
                    if prefix_chamber not in bill_chambers:
                        # skip votes that aren't on bills
                        self.log('skipping vote %s' % data[0])
                        continue

                    votes[data[0]] = Vote(
                        chamber, date, data[13],
                        'PASS' in data[20],
                        int(data[5]),
                        int(data[6]),
                        int(data[7]) + int(data[8]) + int(data[9]),
                        bill_chamber=bill_chambers[prefix_chamber],
                        bill_id=data[3] + data[4], session=session)

                member_vote_file = zf.open(naming_scheme.format(
                    file_label='MemberVotes', session=session))
                # 0: member id
                # 1: chamber (S/H)
                # 2: vote id
                # 3: vote chamber (always same as 1)
                # 4: vote (Y,N,E,X)
                # 5: pair ID (member)
                # 6: pair order
                # If a vote is paired then it should be counted as an 'other'
                for line in member_vote_file.readlines():
                    data = line.split(delimiter)
                    if data[1] != chamber_code:
                        continue
                    try:
                        member_voting = members[data[0]]
                    except KeyError:
                        self.debug('Member %s not found.' % data[0])
                        continue
                    try:
                        vote = votes[data[2]]
                    except KeyError:
                        self.debug('Vote %s not found.' % data[2])
                        continue

                    # -1 votes are Lt. Gov, not included in count, so we add them
                    if data[4] == 'Y' and not data[5]:
                        if data[0] == '-1':
                            vote['yes_count'] += 1
                        vote.yes(member_voting)
                    elif data[4] == 'N' and not data[5]:
                        if data[0] == '-1':
                            vote['no_count'] += 1
                        vote.no(member_voting)
                    else:
                        # for some reason other_count is high for paired votes
                        if data[5]:
                            vote['other_count'] -= 1
                        # is either E: excused, X: no vote, or paired (doesn't count)
                        vote.other(member_voting)

                for vote in votes.values():
                    vote.validate()
                    vote.add_source(vote_data_url)
                    self.save_vote(vote)
            finally:
                zf.close()
        finally:
            # remove file
            os.remove(fname)
Esempio n. 22
0
    def scrape_votes(self, url, chamb):
        """Parse the roll-call page at ``url`` into a ``Vote``.

        Two page layouts are handled.  When the page has more than four
        ``<font>``/``<span>`` elements in total, the header fields are read
        from those elements and the individual votes from the cells of the
        last table.  Otherwise the whole report is read from the last child
        of the first ``<pre>`` element.

        url   -- address of the roll-call page.
        chamb -- chamber of the vote, passed through to ``Vote``.

        The date is passed to ``Vote`` exactly as it appears in the page
        text (a string), and ``passed`` is true when the yea count reaches
        the 'Necessary for Adoption/Passage' count.

        Returns the populated ``Vote``.
        """
        def record(vote, value, name):
            """File ``name`` under yes, no or other according to ``value``."""
            if value == 'Y':
                vote.yes(name)
            elif value == 'N':
                vote.no(name)
            else:
                vote.other(name)

        with self.urlopen(url) as doc:
            soup = BeautifulSoup(doc)
            date = None
            motion = None
            yeas = None
            neas = None
            others = None
            chamber = chamb
            necessary = None
            vote = None
            digits = re.compile(r'^[\d ]+$')

            fonts = soup.findAll('font')
            span = soup.findAll('span')
            if (len(fonts) + len(span)) > 4:  #data is vaguely structured
                if len(fonts) < 4:
                    fonts = span
                for line in fonts:
                    #this could be sped up.
                    line = str(line.contents[0]).strip()
                    if "Taken on" in line:
                        #then the text is in the form of: "Take on <date> <reason>"
                        split = line.split(None, 3)
                        date = split[2]
                        if len(split) > 3:
                            motion = split[3]
                    elif "Those voting Yea" in line:
                        yeas = self.get_num_from_line(line)
                    elif "Those voting Nay" in line:
                        neas = self.get_num_from_line(line)
                    elif "Those absent and not voting" in line:
                        others = self.get_num_from_line(line)
                    elif ("Necessary for Adoption" in line or
                          "Necessary for Passage" in line):
                        necessary = self.get_num_from_line(line)
                passed = bool(yeas >= necessary)
                vote = Vote(chamber, date, motion, passed, yeas, neas, others)

                #figure out who voted for what
                table = soup.findAll('table')
                tds = table[-1].findAll('td')  #get the last table

                # Cells alternate: a vote letter, then the voter's name.
                vote_value = None
                for cell in tds:
                    #either we are looking at fonts or spans
                    node = cell.find('font')
                    if node is None:
                        node = cell.find('span')
                    text = node.contents[0].strip() if node is not None else ''
                    if not text or digits.search(text) is not None:
                        continue
                    if vote_value is None:
                        if text in ('Y', 'N'):
                            vote_value = text
                        elif text in ('X', 'A'):
                            vote_value = 'X'
                    else:
                        record(vote, vote_value, text)
                        vote_value = None

            else:
                #data is mostly unstructured. Have to sift through a string
                data = soup.find('pre')
                report = data.contents[-1].strip()
                lines = re.split(r'\n+|\r+|\f+', report)
                for i, line in enumerate(lines):
                    line = line.strip()
                    if "Taken on" in line:
                        #then the text is in the form of: "Take on <date> <reason>"
                        split = line.split(None, 3)
                        date = split[2]
                        if len(split) > 3:
                            motion = split[3]
                    elif "Those voting Yea" in line:
                        yeas = self.get_num_from_line(line)
                    elif "Those voting Nay" in line:
                        neas = self.get_num_from_line(line)
                    elif "Those absent and not voting" in line:
                        others = self.get_num_from_line(line)
                    elif ("Necessary for Adoption" in line or
                          "Necessary for Passage" in line):
                        if "Adoption" in line:
                            motion = "Adoption"
                        else:
                            motion = "Passage"
                        necessary = self.get_num_from_line(line)
                    elif "The following is the roll call vote:" in line:
                        break  #the next lines contain actual votes
                #process the vote values
                passed = bool(yeas >= necessary)
                vote = Vote(chamber, date, motion, passed, yeas, neas, others)
                # Re-split the remaining lines on double spaces so each token
                # is a vote letter, a name, or a number.  str.join is used
                # because a local called ``string`` elsewhere in the function
                # would make ``string.join`` raise UnboundLocalError.
                words = '  '.join(lines[i + 1:]).split('  ')
                absent_vote_value = re.compile(r'^(X|A)$')
                yea_vote_value = re.compile(r'^Y$')
                nea_vote_value = re.compile(r'^N$')
                #there aren't two spaces between vote and name so it doesn't get parsed
                annoying_vote = re.compile(r'^(Y|X|A|N) ([\S ]+)$')
                vote_value = None
                for word in words:
                    word = word.strip()
                    if not word or digits.search(word) is not None:
                        continue
                    word = strip_digits(word)
                    if vote_value is not None:
                        record(vote, vote_value, word)
                        vote_value = None
                    elif absent_vote_value.match(word) is not None:
                        vote_value = 'X'
                    elif yea_vote_value.match(word) is not None:
                        vote_value = 'Y'
                    elif nea_vote_value.match(word) is not None:
                        vote_value = 'N'
                    else:
                        inline = annoying_vote.match(word)
                        if inline is not None:
                            # group(1) is the vote letter, group(2) the name
                            record(vote, inline.group(1), inline.group(2))
            return vote
Esempio n. 23
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of
        data) and save every bill listed on it.

        For each table row this builds a ``Bill``, attaches its versions,
        history actions, sponsors and recorded votes, and passes the result
        to ``self.save_bill``.

        :param session: session identifier, handed to ``get_bill_folder`` and
            ``Bill`` unchanged.
        :param chamber: ``"Senate"`` or ``"House"``; anything else raises
            ``KeyError``.
        :raises AssertionError: if a vote page reports a different bill id
            than the sheet row it was linked from.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions inside each row of the bill table.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # Bill id prefix -> bill type.  No prefix is the start of another
        # one, so at most one entry matches any given id.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        sheet_html = self.get(sheet_url).text
        sheet_page = lxml.html.fromstring(sheet_html)
        sheet_page.make_links_absolute(sheet_url)

        for bill in sheet_page.xpath('//table/tr'):
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id is None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # Budget bills are missing the ".pdf" suffix, so only cut
                # the id when there actually is a dot.
                bill_id = bill_id[:dot_loc]

            title_and_sponsor = bill[index["title_sponsor"]][0]
            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            if bill_title is None:
                continue  # Odd ...

            # Whatever follows the title in the cell is the sponsor list,
            # with individual sponsors separated by "--".
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            bill_type = None
            for prefix, category in cats.items():
                if bill_id.startswith(prefix):
                    bill_type = category

            b = Bill(session,
                     bill_chamber,
                     bill_id,
                     bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            for version in self.parse_versions(versions_url):
                b.add_version(version['name'],
                              version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = bill[index["history"]][0][0].attrib['href']

            history = self.parse_history(bill_history_href)
            if history is None:
                self.logger.warning(
                    "Bill history for %s is not correctly formatted" % bill_id)
                continue
            b.add_source(bill_history_href)

            for action, date in history:
                # ``chamber`` is "Senate" or "House" at this point (the
                # lookup above would have raised otherwise), so the acting
                # chamber is exactly ``bill_chamber``.
                attrs = dict(actor=bill_chamber, action=action, date=date)
                attrs.update(self.categorizer.categorize(action))
                b.add_action(**attrs)

            for sponsor in sponsors:
                if sponsor in ("", "(NONE)"):
                    continue
                if "&" in sponsor:
                    # Several names joined by "&" share a single entry.
                    for co_sponsor in sponsor.split("&"):
                        b.add_sponsor("primary", co_sponsor.strip())
                else:
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes

            # Exactly one "Votes" link is expected per row; the tuple
            # unpacking raises ValueError otherwise.
            bill_vote_href, = bill.xpath(".//a[contains(text(), 'Votes')]")
            bill_vote_href = bill_vote_href.attrib['href']
            votes = self.parse_votes(bill_vote_href)

            if (votes['sanity-check'] == 'This site only supports frames '
                    'compatible browsers!'):
                votes['votes'] = []
            elif votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                # Raised explicitly so the check also fires under ``-O``,
                # where ``assert`` statements are removed.
                raise AssertionError(
                    "vote page is for %s, expected %s" %
                    (votes['sanity-check'], bill_id))

            for vote_record in votes['votes']:
                filed_votes = vote_record['votes']
                passage = vote_record['meta']
                result = vote_record['result']

                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")
                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.  Recount from the
                # individual votes and trust that number instead.
                local_other = 0
                for voter in filed_votes:
                    l_vote = filed_votes[voter].lower().strip()
                    if l_vote != "yes" and l_vote != "no":
                        local_other = local_other + 1

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" %
                                 (other, local_other))
                    other = local_other

                passed = (result['FINAL_ACTION'] == "PASS")
                if passage['MOTION'].strip() == "":
                    continue

                if "without objection" in passage['MOTION'].lower():
                    passed = True

                v = Vote(actor,
                         pydate,
                         passage['MOTION'],
                         passed,
                         int(result['YES']),
                         int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(passage['url'])

                # XXX: Add more stuff to kwargs, we have a ton of data
                seen = set()
                for who in filed_votes:
                    if who in seen:
                        raise Exception("Seeing the double-thing. - bug #702")
                    seen.add(who)

                    cast = filed_votes[who].lower()
                    if cast == "yes":
                        v.yes(who)
                    elif cast == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
Esempio n. 24
0
    def parse_bill_votes(self, doc, bill):
        """
        Fetch every roll-call page linked from ``doc`` and add it to ``bill``.

        A link counts as a vote page when its ``href`` contains ``"votes"``
        and ends with ``"htm"``.  Each distinct page is fetched once, parsed
        into a ``Vote`` and attached with ``bill.add_vote``.

        :param doc: parsed page; only ``doc.cssselect('a')`` is used.
        :param bill: object receiving the votes through ``add_vote``.
        """
        # MD has a habit of listing votes twice
        seen_votes = set()

        for elem in doc.cssselect('a'):
            href = elem.get('href')
            if not (href and "votes" in href and href.endswith('htm')):
                continue
            if href in seen_votes:
                continue
            seen_votes.add(href)

            vote_url = BASE_URL + href
            with self.urlopen(vote_url) as vote_html:
                vote_doc = lxml.html.fromstring(vote_html)

                # motion
                box = vote_doc.xpath('//td[@colspan=3]/font[@size=-1]/text()')
                motion = box[-1]

                # Classify the motion; when several patterns match, the one
                # iterated last wins, and 'other' is the fallback.
                vote_type = 'other'
                for regex, vtype in vote_classifiers.items():
                    if re.search(regex, motion, re.IGNORECASE):
                        vote_type = vtype

                # counts
                bs = vote_doc.xpath('//td[@width="20%"]/font/b/text()')
                yeas = int(bs[0].split()[0])
                nays = int(bs[1].split()[0])
                excused = int(bs[2].split()[0])
                not_voting = int(bs[3].split()[0])
                absent = int(bs[4].split()[0])

                # date
                # parse the following format: March 23, 2009
                date_elem = vote_doc.xpath(
                    '//font[starts-with(text(), "Legislative Date")]')[0]
                # Skip the 18-character label in front of the date.
                vote_date = datetime.datetime.strptime(
                    date_elem.text[18:], '%B %d, %Y')

                # A fresh dict per page, so nothing leaks between votes.
                params = {
                    'chamber': 'upper' if 'senate' in href else 'lower',
                    'date': vote_date,
                    'motion': motion,
                    'passed': yeas > nays,
                    'yes_count': yeas,
                    'no_count': nays,
                    'other_count': excused + not_voting + absent,
                    'type': vote_type,
                }
                vote = Vote(**params)

                # Heading rows switch the current category; every other row
                # lists legislators (as links) for that category.
                status = None
                for row in vote_doc.cssselect('table')[3].cssselect('tr'):
                    text = row.text_content()
                    if text.startswith('Voting Yea'):
                        status = 'yes'
                    elif text.startswith('Voting Nay'):
                        status = 'no'
                    elif text.startswith(('Not Voting', 'Excused')):
                        status = 'other'
                    elif status is not None:
                        # Links seen before the first heading belong to no
                        # category and are skipped instead of crashing.
                        for cell in row.cssselect('a'):
                            if cell.text is not None:
                                getattr(vote, status)(cell.text.strip())

                vote.add_source(vote_url)
                bill.add_vote(vote)
Esempio n. 25
0
    def scrape_senate(self, session):
        """
        Scrape roll-call votes out of the Senate journal PDFs for ``session``.

        Every link found on the journal index page is downloaded, converted
        to text with ``convert_pdf`` and scanned line by line.  Each completed
        roll call is stored through ``self.save_vote``.

        :param session: session identifier; interpolated into the module-level
            ``journals`` URL template and attached to every ``Vote``.
        """
        url = journals % (session, 'Senate')
        page = self.lxmlize(url)
        hrefs = page.xpath("//font//a")

        for href in hrefs:
            journal_url = href.attrib['href']
            (path, response) = self.urlretrieve(journal_url)
            try:
                data = convert_pdf(path, type='text')
            finally:
                # The download is only needed for the conversion; remove it
                # even if the conversion (or anything after it) fails.
                os.unlink(path)

            cur_bill_id = None
            cur_vote_count = None
            in_vote = False
            cur_question = None
            in_question = False
            known_date = None
            cur_vote = {}

            for line in data.split("\n"):
                # The first date found in the journal dates all its votes.
                if not known_date:
                    found = date_re.findall(line)
                    if found:
                        date_text, dow = found[0]
                        known_date = datetime.datetime.strptime(
                            date_text, "%A, %B %d, %Y")

                if in_question:
                    # Continuation of a multi-line question.  A line that
                    # starts with a digit, or has no space to split on,
                    # ends the question.
                    line = line.strip()
                    if re.match(r"\d+", line):
                        in_question = False
                        continue
                    try:
                        # Drop the trailing token of the line.
                        line, _ = line.rsplit(" ", 1)
                    except ValueError:
                        in_question = False
                        continue

                    # Append each continuation line exactly once.
                    cur_question += line

                if not in_vote:
                    summ = vote_re.findall(line)
                    if summ:
                        # Tally line: the individual votes follow.
                        cur_vote = {}
                        cur_vote_count = summ[0]
                        in_vote = True
                        continue

                    if ("The question being" in line) or \
                       ("On motion of" in line) or \
                       ("the following" in line) or \
                       ("moved that the" in line):
                        cur_question, _ = line.strip().rsplit(" ", 1)
                        in_question = True

                    if line.strip() == "":
                        continue
                    first = line[0]
                    if first != " ":
                        if " " not in line:
                            # A single unindented word cannot be a bill
                            # reference.
                            continue

                        # An unindented line starting with something like
                        # "HB"/"SJ"/"SR" names the bill under consideration.
                        bill_id, kruft = line.split(" ", 1)
                        if len(bill_id) < 3:
                            continue
                        if bill_id[0] not in ("H", "S"):
                            continue
                        if bill_id[1] not in ['B', 'J', 'R', 'M']:
                            continue

                        cur_bill_id = bill_id
                else:
                    line = line.strip()
                    try:
                        line, lineno = line.rsplit(" ", 1)
                    except ValueError:
                        # A line without a space ends the roll call: emit
                        # the vote if we know both its question and bill.
                        in_vote = False
                        if cur_question is None:
                            continue

                        if cur_bill_id is None:
                            continue

                        # NOTE(review): assumes vote_re captures exactly
                        # (yes, no, excused, absent) -- confirm against the
                        # pattern's definition.
                        yes, no, exc, ab = cur_vote_count
                        other = int(exc) + int(ab)
                        yes, no, other = int(yes), int(no), int(other)

                        bc = {'H': 'lower', 'S': 'upper'}[cur_bill_id[0]]

                        vote = Vote('upper',
                                    known_date,
                                    cur_question, (yes > no),
                                    yes,
                                    no,
                                    other,
                                    session=session,
                                    bill_id=cur_bill_id,
                                    bill_chamber=bc)
                        for person in cur_vote:
                            if person is None:
                                continue

                            howvote = cur_vote[person].upper()
                            if howvote == 'Y':
                                vote.yes(person)
                            elif howvote == 'N':
                                vote.no(person)
                            else:
                                vote.other(person)
                        vote.add_source(journal_url)
                        self.save_vote(vote)

                        cur_vote, cur_question, cur_vote_count = (None, None,
                                                                  None)
                        continue

                    # Record how each legislator on this line voted.
                    votes = re.findall(votes_re, line)

                    for person in votes:
                        name, li, vot = person
                        cur_vote[name] = vot
Esempio n. 26
0
    def parse_old_vote_page(self, vote_url):
        """
        Parse a single roll-call page and return it as a ``Vote``.

        The motion, the five tallies (yea, nay, excused, not voting, absent),
        the legislative date and the per-legislator votes are all read from
        the page at ``vote_url``.

        :param vote_url: address of the vote page; a URL containing
            ``'senate'`` is treated as an upper-chamber vote, anything else
            as lower.
        :returns: the populated ``Vote``.
        """
        with self.urlopen(vote_url) as vote_html:
            vote_doc = lxml.html.fromstring(vote_html)

            # motion
            box = vote_doc.xpath('//td[@colspan=3]/font[@size=-1]/text()')
            motion = box[-1]

            # Classify the motion; when several patterns match, the one
            # iterated last wins, and 'other' is the fallback.
            vote_type = 'other'
            for regex, vtype in vote_classifiers.items():
                if re.search(regex, motion, re.IGNORECASE):
                    vote_type = vtype

            # counts
            bs = vote_doc.xpath('//td[@width="20%"]/font/b/text()')
            yeas = int(bs[0].split()[0])
            nays = int(bs[1].split()[0])
            excused = int(bs[2].split()[0])
            not_voting = int(bs[3].split()[0])
            absent = int(bs[4].split()[0])

            # date
            # parse the following format: March 23, 2009
            date_elem = vote_doc.xpath(
                '//font[starts-with(text(), "Legislative Date")]')[0]
            # Skip the 18-character label in front of the date.
            vote_date = datetime.datetime.strptime(
                date_elem.text[18:], '%B %d, %Y')

            params = {
                'chamber': 'upper' if 'senate' in vote_url else 'lower',
                'date': vote_date,
                'motion': motion,
                'passed': yeas > nays,
                'yes_count': yeas,
                'no_count': nays,
                'other_count': excused + not_voting + absent,
                'type': vote_type,
            }
            vote = Vote(**params)

            # Heading rows switch the current category; every other row
            # lists legislators (as links) for that category.
            status = None
            for row in vote_doc.xpath('//table')[3].xpath('tr'):
                text = row.text_content()
                if text.startswith('Voting Yea'):
                    status = 'yes'
                elif text.startswith('Voting Nay'):
                    status = 'no'
                elif text.startswith(('Not Voting', 'Excused')):
                    status = 'other'
                elif status is not None:
                    # Search descendants: the links sit inside the row's
                    # cells, so a plain 'a' step (direct children of the
                    # row only) would never find them.
                    for cell in row.xpath('.//a'):
                        if cell.text is not None:
                            getattr(vote, status)(cell.text.strip())
        return vote
Esempio n. 27
0
def viva_voce_votes(root, session):
    """
    Yield a ``Vote`` for every viva voce (voice) vote found under ``root``.

    Voice votes are located through the "All Members are deemed ..." notice
    that follows them: the text of the element two siblings before each
    notice is matched against two patterns, one that names the bill directly
    and one that only says "The bill was ...".

    :param root: parsed journal document (must support ``xpath``).
    :param session: session identifier; only its first two characters are
        stored on the vote.
    """
    prev_id = None
    for el in root.xpath(u'//div[starts-with(., "All Members are deemed")]'):
        text = ''.join(el.getprevious().getprevious().itertext())
        # str.replace returns a new string; keep the result so the patterns
        # below (notably the ``.+`` in the second one) see a single line.
        text = text.replace('\n', ' ')

        # Form 1: "<bill id> [, as amended,] was adopted|passed ... by a
        # viva voce vote".
        m = re.search(
            r'(?P<bill_id>\w+\W+\d+)(,\W+as\W+amended,)?\W+was\W+'
            r'(?P<type>adopted|passed'
            r'(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+'
            r'by\W+a\W+viva\W+voce\W+vote', text)
        if m:
            motion = get_motion(m)

            bill_id = m.group('bill_id')
            bill_id = bill_id.replace(u'\xa0', ' ')
            # Committee substitutes are filed under the plain bill id.
            bill_id = re.sub(r'CS(SB|HB)', r'\1', bill_id)

            bill_chamber = _viva_voce_chamber(bill_id)
            if bill_chamber is None:
                continue

            yield _viva_voce_vote(bill_id, bill_chamber, motion, session)
            continue

        # Form 2: "The bill was ... and was adopted|passed ... by a viva
        # voce vote" -- the bill id has to come from the start of the text
        # or, failing that, from the previous form-2 entry.
        m = re.search(
            r'The\W+bill\W+was.+and\W+was\W+'
            r'(?P<type>adopted|passed'
            r'(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+'
            r'by\W+a\W+viva\W+voce\W+vote', text)
        if m:
            m2 = re.match(r'(HB|SB|CSHB|CSSB|HR|SR)\W+\d+', text)
            if m2:
                bill_id = m2.group()
                prev_id = bill_id
            else:
                # This is scary
                bill_id = prev_id

            if not bill_id:
                continue

            bill_chamber = _viva_voce_chamber(bill_id)
            if bill_chamber is None:
                continue

            bill_id = bill_id.replace(u'\xa0', ' ')
            motion = get_motion(m)

            yield _viva_voce_vote(bill_id, bill_chamber, motion, session)
            continue


def _viva_voce_chamber(bill_id):
    """Return 'lower' or 'upper' for a bill id, or None if it is neither."""
    if bill_id.startswith('H') or bill_id.startswith('CSHB'):
        return 'lower'
    if bill_id.startswith('S') or bill_id.startswith('CSSB'):
        return 'upper'
    return None


def _viva_voce_vote(bill_id, bill_chamber, motion, session):
    """Build the passed, zero-count Vote that represents one voice vote."""
    # No identifier, generate our own
    record = str(uuid.uuid1())

    vote = Vote(None, None, motion, True, 0, 0, 0)
    vote['bill_id'] = bill_id
    vote['bill_chamber'] = bill_chamber
    vote['session'] = session[0:2]
    vote['method'] = 'viva voce'
    vote['record'] = record
    vote['type'] = get_type(motion)
    return vote
Esempio n. 28
0
    def get_senate_votes(self):
        """
        Attach every Senate floor and committee vote on the page to the bill.

        Votes are located through their bold headings in ``self.doc``
        ("VOTE: FLOOR VOTE:" and "VOTE: COMMITTEE VOTE:").  Each one becomes
        an upper-chamber ``Vote`` with ``self.url`` as its source and is
        added to ``self.bill``.  A vote counts as passed only when the ayes
        outnumber the nays and "other" votes combined.
        """
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            # Heading text: "... - <Mon DD, YYYY>"
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            counts, voters = self._tally_vote_block(b, sum_repeated=False)
            passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote('upper', date, 'Floor Vote', passed, counts['yes'],
                        counts['no'], counts['other'])

            for name, _ in voters['yes']:
                vote.yes(name)
            for name, _ in voters['no']:
                vote.no(name)

            # NOTE(review): the key used here is the raw link text, i.e. the
            # legislator's own (unstripped) name, not the category heading
            # (Excused/Abstain/Absent).  That looks unintended, but it is
            # kept as-is because consumers of 'actual_vote' are not visible
            # from here.
            actual_vote = collections.defaultdict(list)
            for name, link_text in voters['other']:
                vote.other(name)
                actual_vote[link_text].append(name)

            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
            # Heading text: "<label> <tab> - <committee> <tab> - <date>"
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            counts, voters = self._tally_vote_block(b, sum_repeated=True)
            passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote('upper', date, '%s Committee Vote' % committee,
                        passed, counts['yes'], counts['no'], counts['other'])

            for name, _ in voters['yes']:
                vote.yes(name)
            for name, _ in voters['no']:
                vote.no(name)
            for name, _ in voters['other']:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)

    def _tally_vote_block(self, header, sum_repeated):
        """
        Read the tallies and voter names listed after a vote heading.

        Walks every child element of the ``blockquote`` siblings that follow
        ``header``.  ``<b>`` elements are category headings carrying a count
        in the form ``(N):``; ``<a>`` elements are legislators belonging to
        the most recent heading.  Other elements are ignored.

        :param header: the ``<b>`` element that introduces the vote.
        :param sum_repeated: when true, repeated ``Ayes``/``Nays`` headings
            add to the running count; when false the latest heading replaces
            it.  ``Excused``/``Abstain``/``Absent`` counts are always summed.
        :returns: ``(counts, voters)`` -- two dicts keyed by ``'yes'``,
            ``'no'`` and ``'other'``.  ``voters`` holds
            ``(stripped_name, raw_link_text)`` tuples.
        :raises ValueError: on a heading that is not a known category.
        """
        counts = {'yes': 0, 'no': 0, 'other': 0}
        voters = {'yes': [], 'no': [], 'other': []}

        vtype = None
        for tag in header.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype = 'yes'
                elif text.startswith('Nays'):
                    vtype = 'no'
                elif text.startswith(('Excused', 'Abstain', 'Absent')):
                    vtype = 'other'
                else:
                    raise ValueError('bad vote type: %s' % tag.text)

                count = int(re.search(r'\((\d+)\):', text).group(1))
                if sum_repeated or vtype == 'other':
                    counts[vtype] += count
                else:
                    counts[vtype] = count
            elif tag.tag == 'a':
                name = tag.text.strip()
                # Names that appear before any heading have no category.
                if vtype is not None:
                    voters[vtype].append((name, tag.text))

        return counts, voters
Esempio n. 29
0
    def scrape_votes(self, bill, link):
        """
        Parse the vote page at ``link`` and add each vote found to ``bill``.

        The page keeps all votes in one text block (``lblVoteData``).  The
        block is split into one chunk per vote, and every chunk into segments
        separated by a run of non-breaking spaces: the first segment is the
        motion (including its date), the rest carry counts and name lists.

        :param bill: object receiving the votes through ``add_vote``.
        :param link: URL of the vote page; ``'ChamberVoting=H'`` in it marks
            a lower-chamber vote, anything else is treated as upper.
        :returns: ``bill``.
        """
        page = self.urlopen(link)
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
        raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-', raw_vote_data.strip())[1:]

        # Patterns are the same for every vote, so build them once.
        date_regex = re.compile(r'(\d+/\d+/\d+)')
        vote_regex = re.compile(r'\d+$')
        aye_regex = re.compile(r'^.+voting aye were: (.+) -')
        no_regex = re.compile(r'^.+voting no were: (.+) -')
        other_regex = re.compile(r'^.+present and not voting were: (.+) -')

        if 'ChamberVoting=H' in link:
            chamber = 'lower'
        else:
            chamber = 'upper'

        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            # Stays None when the motion carries no date.
            vote_date = date_regex.search(motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(), '%m/%d/%Y')

            # A chunk may consist of the motion alone, so only look at the
            # second segment when there is one.
            passed = ('Passed' in motion or
                      'Recommended for passage' in motion or
                      (len(raw_vote) > 1 and 'Adopted' in raw_vote[1]))

            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for v in raw_vote[1:]:
                v = v.strip()

                # Count lines end in the tally, e.g. "Ayes...<n>".
                count = vote_regex.search(v)
                if count and v.startswith('Ayes...'):
                    yes_count = int(count.group())
                    continue
                if count and v.startswith('Noes...'):
                    no_count = int(count.group())
                    continue
                if count and v.startswith('Present and not voting...'):
                    other_count += int(count.group())
                    continue

                # Otherwise it may be a comma-separated list of names.
                names = aye_regex.search(v)
                if names:
                    ayes = names.group(1).split(', ')
                    continue
                names = no_regex.search(v)
                if names:
                    nos = names.group(1).split(', ')
                    continue
                names = other_regex.search(v)
                if names:
                    others += names.group(1).split(', ')

            vote = Vote(chamber, vote_date, motion, passed, yes_count,
                        no_count, other_count)
            vote.add_source(link)

            # Record each legislator once, keeping the first category they
            # appear in (ayes, then noes, then others).
            seen = set()
            for names, record in ((ayes, vote.yes),
                                  (nos, vote.no),
                                  (others, vote.other)):
                for name in names:
                    if name in seen:
                        continue
                    record(name)
                    seen.add(name)

            # vote.validate()
            bill.add_vote(vote)

        return bill
Esempio n. 30
0
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):

        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_session, chamber, bill_id, '')

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version(bill_id, source_url, 'text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    type_.append('fiscal committee')
                if version.local_program == 'Yes':
                    type_.append('local program')
                if version.urgency == 'Yes':
                    type_.append('urgency')
                if version.taxlevy == 'Yes':
                    type_.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['summary'] = summary
            fsbill['type'] = type_
            fsbill['subjects'] = filter(None, [subject])
            fsbill['impact_clause'] = impact_clause

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                       author.name,
                                       official_type=author.contribution)

            introduced = False
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'other'
                else:

                    def replacer(matchobj):
                        """Translate a matched chamber name into its chamber code.

                        Meant to be used as the replacement callback of
                        ``re.sub``, which only invokes it with a real match
                        object; no ``None`` handling is therefore needed.

                        :param matchobj: match whose full text is
                            ``'Assembly'`` or ``'Senate'``.
                        :returns: ``'lower'`` for Assembly, ``'upper'`` for
                            Senate.
                        :raises KeyError: if the matched text is anything
                            other than the two chamber names.
                        """
                        chamber_codes = {
                            'Assembly': 'lower',
                            'Senate': 'upper'
                        }
                        return chamber_codes[matchobj.group()]

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                if act_str.startswith('Introduced'):
                    introduced = True
                    type_.append('bill:introduced')

                if 'Read first time.' in act_str:
                    if not introduced:
                        type_.append('bill:introduced')
                        introduced = True
                    type_.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    type_.append('committee:referred')

                if 'Read third time.  Passed' in act_str:
                    type_.append('bill:passed')

                if 'Read third time. Passed' in act_str:
                    type_.append('bill:passed')

                if 'Read third time, passed' in act_str:
                    type_.append('bill:passed')

                if re.search(r'Read third time.+?Passed and', act_str):
                    type_.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    type_.append('governor:signed')

                if 'Approved by the Governor' in act_str:
                    type_.append('governor:signed')

                if 'Item veto' in act_str:
                    type_.append('governor:vetoed:line-item')

                if 'Vetoed by Governor' in act_str:
                    type_.append('governor:vetoed')

                if 'To Governor' in act_str:
                    type_.append('governor:received')

                if 'Read second time' in act_str:
                    type_.append('bill:reading:2')

                if not type_:
                    type_ = ['other']

                # Add in the committee strings of the related committees, if any.
                kwargs = {}
                matched_abbrs = committee_abbr_regex.findall(action.action)
                if 'Com. on' in action.action and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    raise ValueError(action.action)
                if matched_abbrs:

                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)
                        else:
                            committees.append(name)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(committees) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)

                changed = False
                for string in ['upper', 'lower', 'joint']:
                    if actor.startswith(string):
                        actor = string
                        changed = True
                        break
                if not changed:
                    actor = 'other'
                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = '(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                fsbill.add_action(actor,
                                  act_str,
                                  action.action_date.date(),
                                  type_=type_,
                                  **kwargs)

            for vote in bill.votes:
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                motion = vote.motion.motion_text or ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type_=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                for s in ('yes', 'no', 'other'):
                    # Kill dupe votes.
                    key = s + '_votes'
                    fsvote[key] = list(set(fsvote[key]))

                # In a small percentage of bills, the integer vote counts
                # are inaccurate, so let's ignore them.
                for k in ('yes', 'no', 'other'):
                    fsvote[k + '_count'] = len(fsvote[k + '_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)