def _fix_house_text(self, filename):
        '''
        Rebuild the House committee list text from two PDF conversions.

        The no-layout conversion of the 2015 House committee list mixes
        up the second and third columns of the second page. The layout
        conversion keeps them apart, but it is harder to parse in every
        other case, so it is used here only to recover one column.

        Everything from "Appropriations/F&C" onward is cut from the
        no-layout text, and the column found under the
        "State Administration (cont.)" header in the layout text is
        appended instead. The other columns are joint committees, which
        are scraped from the Senate document.

        Returns the combined text. Fails an assertion if the document's
        revision date no longer matches the one this fix was written for.
        '''

        # First pass: the conversion settings used everywhere else
        usable_text = convert_pdf(filename, type='text-nolayout')
        assert "Revised: January 23, 2015" in usable_text,\
            "House committee list has changed; check that the special-case"\
            " fix is still necessary, and that the result is still correct"
        usable_text = re.sub(r'(?sm)Appropriations/F&C.*$', "", usable_text)

        # Second pass: only the one damaged column is taken from here
        layout_rows = convert_pdf(filename, type='text').split('\n')

        HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.)      "
        # The single-element unpacking insists on exactly one header row
        (header_row_number, ) = [
            number for number, row in enumerate(layout_rows)
            if HEADER_OF_COLUMN_TO_REPLACE in row
        ]

        # The column starts one character to the left of its header
        column_start = layout_rows[header_row_number].index(
            HEADER_OF_COLUMN_TO_REPLACE) - 1
        column_end = column_start + len(HEADER_OF_COLUMN_TO_REPLACE)

        recovered_column = '\n'.join(
            row[column_start:column_end]
            for row in layout_rows[header_row_number + 1:]
        )

        return usable_text + recovered_column
Example #2
0
    def _fix_house_text(self, filename):
        '''
        Combine two conversions of the House committee PDF into one text.

        With the no-layout conversion, the second and third columns on
        the second page of the 2015 House committee list come out mixed
        together. The layout conversion does not have that problem, but
        the no-layout output is easier to parse everywhere else, so the
        layout output is only used to patch the broken part.

        The no-layout text is truncated at "Appropriations/F&C", then
        the column below the "State Administration (cont.)" header of
        the layout text is appended. The remaining columns are joint
        committees that are scraped from the Senate document.

        Returns the patched text. An assertion fails when the expected
        revision date is missing from the document.
        '''

        # Text produced by the normally-working settings
        main_text = convert_pdf(filename, type='text-nolayout')
        assert "Revised: January 23, 2015" in main_text,\
            "House committee list has changed; check that the special-case"\
            " fix is still necessary, and that the result is still correct"
        main_text = re.sub(r'(?sm)Appropriations/F&C.*$', "", main_text)

        # Text produced by the alternate settings, one entry per line
        alt_rows = convert_pdf(filename, type='text').split('\n')

        HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.)      "
        # Unpacking into a 1-tuple raises unless exactly one row matches
        (header_at, ) = [
            position for position, content in enumerate(alt_rows)
            if HEADER_OF_COLUMN_TO_REPLACE in content
        ]

        # Slice bounds: one character left of the header, header-wide
        left = alt_rows[header_at].index(HEADER_OF_COLUMN_TO_REPLACE) - 1
        right = left + len(HEADER_OF_COLUMN_TO_REPLACE)

        pieces = []
        for content in alt_rows[header_at + 1:]:
            pieces.append(content[left:right])

        return main_text + '\n'.join(pieces)
Example #3
0
    def scrape_senate_vote(self, bill, url):
        """Parse a Senate roll-call PDF and attach the vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and deleted.
        The vote date and time, the YEAS / NAYS / EXCUSED / NOT VOTING
        tallies, the individual names and the motion text are then read
        from that text.

        If the date, the time, the yes/no tallies or the motion cannot be
        found, a message is logged and nothing is added to ``bill``.

        bill -- mapping with a "bill_id" entry; receives the vote through
                ``bill.add_vote``
        url -- address of the roll-call PDF
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, "text")
        os.remove(path)

        lines = text.split("\n")

        date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
        if not date_match:
            self.log("Couldn't find date on %s" % url)
            return

        # The time is required to build the timestamp; without this guard
        # a missing "Time:" field raised AttributeError on None.
        time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
        if not time_match:
            self.log("Couldn't find time on %s" % url)
            return

        date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                             time_match.group(2))
        date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
        date = self._tz.localize(date)

        vote_type = None
        yes_count, no_count, other_count = None, None, 0
        votes = []
        # The first 21 lines are skipped before tallies are looked for
        # (presumably the document header -- verify against a sample PDF).
        for line in lines[21:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith("YEAS"):
                yes_count = int(line.split(" - ")[1])
                vote_type = "yes"
            elif line.startswith("NAYS"):
                no_count = int(line.split(" - ")[1])
                vote_type = "no"
            elif line.startswith(("EXCUSED", "NOT VOTING")):
                # Both categories are folded into a single "other" tally
                other_count += int(line.split(" - ")[1])
                vote_type = "other"
            else:
                # Names on one line are separated by two or more spaces and
                # belong to the most recently seen tally heading
                votes.extend([(n.strip(), vote_type)
                              for n in re.split(r"\s{2,}", line)])

        if yes_count is None or no_count is None:
            self.log("Couldne't find vote counts in %s" % url)
            return

        passed = yes_count > no_count + other_count

        # The motion is taken from two lines below the line holding the
        # bill id; the last such line wins when the id appears repeatedly.
        clean_bill_id = fix_bill_id(bill["bill_id"])
        motion_line = None
        for i, line in enumerate(lines):
            if line.strip() == clean_bill_id:
                motion_line = i + 2

        # Indexing with None (id never found) or past the end of the text
        # used to raise before the "no motion" check could run.
        motion = None
        if motion_line is not None and motion_line < len(lines):
            motion = lines[motion_line]
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote("upper", date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        insert_specific_votes(vote, votes)
        check_vote_counts(vote)

        bill.add_vote(vote)
Example #4
0
    def scrape_rollcall(self, vote, vurl):
        """Record the names listed in a roll-call PDF on ``vote``.

        The PDF at ``vurl`` is downloaded, converted to text and deleted.
        A line starting with YEAS/AYES, NAYS, or EXCUSED/NOT VOTING/ABSTAIN
        selects ``vote.yes``, ``vote.no`` or ``vote.other`` respectively;
        the names on the following lines are passed to the selected method.
        Blank lines, "Page " lines and any names seen before the first
        heading are ignored.
        """
        (path, resp) = self.urlretrieve(vurl)
        pdf_text = convert_pdf(path, 'text')
        os.remove(path)

        # Method that receives the next names; None until a heading is seen
        record = None

        for raw in pdf_text.split('\n'):
            entry = raw.strip()

            if entry.startswith(('YEAS', 'AYES')):
                record = vote.yes
            elif entry.startswith('NAYS'):
                record = vote.no
            elif entry.startswith(('EXCUSED', 'NOT VOTING', 'ABSTAIN')):
                record = vote.other
            elif entry and not entry.startswith('Page ') and record:
                # Names on one line are separated by 3 or more spaces
                for name in re.split('\s{3,}', entry):
                    if name:
                        record(name.strip())
Example #5
0
    def scrape_upper_committee(self, url):
        """Scrape upper-chamber committees and their members from a PDF.

        The PDF at ``url`` is downloaded and converted to XML. On each
        page, a bold text starting with "Comisi" opens a committee, and
        every text starting with "Hon." is added to it as a member. The
        part after the first dash, when present, is mapped to a role
        (chairman, vicechairman, secretary); anything else is a plain
        member. The committee is saved at the end of its page.

        Member lines seen before any committee heading are skipped, and a
        page without a committee heading saves nothing; both cases used
        to fail on a ``None`` committee.
        """
        # Spanish officer titles, in both grammatical genders
        roles = {
            'Presidenta': 'chairman',
            'Presidente': 'chairman',
            'Vicepresidente': 'vicechairman',
            'Vicepresidenta': 'vicechairman',
            'Secretaria': 'secretary',
            'Secretario': 'secretary',
        }

        filename, resp = self.urlretrieve(url)
        try:
            xml = convert_pdf(filename, 'xml')
        finally:
            # The file is only needed for the conversion; remove it even
            # when the conversion fails so it does not pile up on disk.
            os.remove(filename)

        root = lxml.etree.fromstring(xml)
        for page in root.xpath('/pdf2xml/page'):
            comm = None
            for line in page.findall('text'):
                heading = line.findtext('b')
                if heading is not None and heading.startswith('Comisi'):
                    comm = Committee('upper', heading)
                    comm.add_source(url)
                    continue

                if not (line.text and line.text.startswith('Hon.')):
                    continue
                if comm is None:
                    # No committee heading yet on this page: nothing to
                    # attach this member to.
                    continue

                # Normalise the en dash so one split handles both forms
                name_split = line.text.replace(u'–', '-').split(u'-', 1)
                title = 'member'
                if len(name_split) >= 2:
                    title = roles.get(name_split[1].strip(), 'member')
                if name_split[0] != 'VACANTE':
                    comm.add_member(name_split[0].replace('Hon.', ''), title)

            if comm is not None:
                self.save_committee(comm)
Example #6
0
    def parse_subjects(self, url, chamber_letter):
        """Attach subjects from a bill-index PDF to ``self.all_bills``.

        The PDF at ``url`` is downloaded, converted to text and deleted.
        Lines made only of capital letters and spaces are subject
        headings; a heading that directly follows another heading line is
        appended to it, and a heading that follows an empty line starts a
        new subject. On every other line, each bill id starting with
        ``chamber_letter`` gets the current subject appended to its
        'subjects' list.

        A warning is logged and nothing is done when the PDF cannot be
        fetched. Bill ids missing from ``self.all_bills`` are reported
        with a warning and skipped. Bill ids that appear before any
        subject has been established are skipped as well; previously this
        raised UnboundLocalError.
        """
        try:
            pdf, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("could not fetch subject index %s" % url)
            return
        lines = convert_pdf(pdf, 'text-nolayout').splitlines()
        os.remove(pdf)

        subject_re = re.compile(r'^[A-Z ]+$')
        bill_re = re.compile(r'(?:S|H)[A-Z]{1,2} \d+')

        # Current subject; None until the first heading has been read
        title = None
        last_line = ''

        # The first line of the converted text is not examined
        for line in lines[1:]:
            if 'BILL INDEX' in line:
                # Would otherwise be taken for a subject heading
                pass
            elif subject_re.match(line):
                if subject_re.match(last_line):
                    # Continuation of a heading wrapped over several lines
                    if title is None:
                        title = line
                    else:
                        title += ' %s' % line
                elif last_line == '':
                    title = line
            else:
                for bill_id in bill_re.findall(line):
                    if not bill_id.startswith(chamber_letter):
                        continue
                    if bill_id not in self.all_bills:
                        self.warning("unknown bill %s" % bill_id)
                        continue
                    if title is None:
                        # No subject heading has been seen yet
                        continue
                    self.all_bills[bill_id].setdefault('subjects',
                                                       []).append(title)
            # sometimes we need to look back
            last_line = line
Example #7
0
    def add_house_votes(self, vote, filename):
        """Fill ``vote`` with counts and names read from a House vote PDF.

        The PDF is converted to XML and its text nodes are walked in
        order. A node starting with "AYES" supplies the yes / no / other
        counts ("other" is not-voting plus paired). A node equal to 'N',
        'Y' or 'x' selects ``vote.no``, ``vote.yes`` or ``vote.other`` and
        clears the name being collected; the following nodes are joined
        with spaces into that name until a node equal to 'R', 'D' or 'I'
        is reached, at which point the name is recorded.
        """
        vcount_re = re.compile(
            'AYES.* (\d+).*NAYS.* (\d+).*NOT VOTING.* (\d+).* PAIRED.*(\d+)')
        # lxml.html is used so that text_content() is available
        doc = lxml.html.fromstring(convert_pdf(filename, 'xml'))

        # Vote marker -> name of the method that records that vote
        recorder_names = {'N': 'no', 'Y': 'yes', 'x': 'other'}

        record = None   # method to call with the next completed name
        pending = ''    # name fragments collected so far

        for item in doc.xpath('//text/text()'):
            if item.startswith('AYES'):
                counts = vcount_re.match(item).groups()
                vote['yes_count'] = int(counts[0])
                vote['no_count'] = int(counts[1])
                vote['other_count'] = int(counts[2]) + int(counts[3])
            elif item in recorder_names:
                record = getattr(vote, recorder_names[item])
                pending = ''
            elif item in ('R', 'D', 'I'):
                record(pending)
            elif pending:
                pending += ' ' + item
            else:
                pending = item
Example #8
0
    def __init__(self, url, resp):
        """Convert a downloaded PDF to text and keep its non-empty lines.

        url -- stored on the instance as ``self.url``
        resp -- content of the PDF, written unchanged to a temporary file

        The temporary file is always removed, including when the
        conversion fails. ``self.text`` holds the converted text with
        empty lines dropped.

        Raises PDFCommitteeVoteParseError when the conversion fails or
        produces only whitespace.
        """

        self.url = url

        # Put the document into a temp file for the converter.
        fd, filename = tempfile.mkstemp()
        try:
            # fdopen takes over the descriptor returned by mkstemp, so it
            # is closed by the with-block instead of being left open.
            with os.fdopen(fd, "wb") as f:
                f.write(resp)

            # Convert it to text.
            try:
                text = convert_pdf(filename, type="text")
            except Exception:
                msg = "couldn't convert pdf."
                raise PDFCommitteeVoteParseError(msg)
        finally:
            # Get rid of the temp file, whether or not conversion worked.
            os.remove(filename)

        if not text.strip():
            msg = "PDF file was empty."
            raise PDFCommitteeVoteParseError(msg)

        self.text = "\n".join(filter(None, text.splitlines()))
Example #9
0
    def add_house_votes(self, vote, filename):
        """Read counts and individual votes from a House vote PDF.

        The XML conversion of the PDF is scanned text node by text node.
        The node starting with "AYES" carries the totals: yes, no, and
        "other" as the sum of not-voting and paired. Nodes equal to 'N',
        'Y' or 'x' choose ``vote.no``, ``vote.yes`` or ``vote.other`` for
        the legislator that follows and reset the collected name. Later
        nodes are appended to the name, separated by spaces, until a node
        equal to 'R', 'D' or 'I' triggers the chosen method with the name.
        """
        vcount_re = re.compile('AYES.* (\d+).*NAYS.* (\d+).*NOT VOTING.* (\d+).* PAIRED.*(\d+)')
        converted = convert_pdf(filename, 'xml')
        # lxml.html rather than lxml.etree, for text_content()
        doc = lxml.html.fromstring(converted)

        # Marker text -> attribute of ``vote`` that records the vote
        marker_to_method = {'N': 'no', 'Y': 'yes', 'x': 'other'}

        vfunc = None
        collected = ''

        for piece in doc.xpath('//text/text()'):
            if piece.startswith('AYES'):
                totals = vcount_re.match(piece).groups()
                vote['yes_count'] = int(totals[0])
                vote['no_count'] = int(totals[1])
                vote['other_count'] = int(totals[2]) + int(totals[3])
                continue

            if piece in marker_to_method:
                vfunc = getattr(vote, marker_to_method[piece])
                collected = ''
                continue

            if piece in ('R', 'D', 'I'):
                vfunc(collected)
                continue

            collected = (collected + ' ' + piece) if collected else piece
Example #10
0
    def parse_subjects(self, url, chamber_letter):
        """Read a bill-index PDF and record subjects in ``self.all_bills``.

        After the PDF at ``url`` has been downloaded, converted to text
        and deleted, its lines are scanned. A line consisting only of
        capital letters and spaces is a subject heading: it starts a new
        subject when the previous line was empty, and extends the current
        subject when the previous line was a heading too. Every other
        line is searched for bill ids; those starting with
        ``chamber_letter`` receive the current subject in their
        'subjects' list.

        When the PDF cannot be fetched a warning is logged and the method
        returns. Unknown bill ids are reported with a warning. A bill id
        found before any subject is known is skipped; it used to raise
        UnboundLocalError.
        """
        try:
            pdf, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("could not fetch subject index %s" % url)
            return
        lines = convert_pdf(pdf, 'text-nolayout').splitlines()
        os.remove(pdf)

        subject_re = re.compile(r'^[A-Z ]+$')
        bill_re = re.compile(r'(?:S|H)[A-Z]{1,2} \d+')

        # None means no subject heading has been read so far
        title = None
        last_line = ''

        # Scanning starts at the second line of the converted text
        for line in lines[1:]:
            if 'BILL INDEX' in line:
                # Matches the heading pattern but is not a subject
                pass
            elif subject_re.match(line):
                if subject_re.match(last_line):
                    # Heading continued from the previous line
                    if title is None:
                        title = line
                    else:
                        title += ' %s' % line
                elif last_line == '':
                    title = line
            else:
                for bill_id in bill_re.findall(line):
                    if not bill_id.startswith(chamber_letter):
                        continue
                    if bill_id not in self.all_bills:
                        self.warning("unknown bill %s" % bill_id)
                        continue
                    if title is None:
                        # Nothing to attach yet
                        continue
                    self.all_bills[bill_id].setdefault('subjects',
                                                       []).append(title)
            # sometimes we need to look back
            last_line = line
Example #11
0
    def scrape_vote(self, session, rollcall_number):
        """Download, parse and save one House roll-call vote.

        session -- session name; the last run of digits in it is used in
                   the URL
        rollcall_number -- roll-call number, zero-padded to 5 digits

        The converted text is checked for the number of Y/N/P/X
        characters to guess whether the PDF holds machine-readable text or
        embedded images, and the matching parser is used. The vote is
        saved only when ``house_check_vote`` accepts it; otherwise a
        warning is logged.

        The downloaded PDF is removed in every case, including the
        bad-count early return and parsing errors, which used to leave it
        on disk.

        Raises ``self.EndOfHouseVotes`` when the PDF cannot be fetched.
        """

        # Build the roll-call PDF address.
        url = (
            'http://www.mass.gov/legis/journal/RollCallPdfs/'
            '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
        url_args = dict(
            session=re.findall(r'\d+', session).pop(),
            rollcall=str(rollcall_number).zfill(5))
        url = url.format(**url_args)

        try:
            vote_file, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            # We'll hit a 404 at the end of the votes.
            self.warning('Stopping; encountered a 404 at %s' % url)
            raise self.EndOfHouseVotes

        try:
            text = convert_pdf(vote_file, type='text')
            text = text.decode('utf8')

            # A hack to guess whether this PDF has embedded images or
            # contains machine readable text.
            if len(re.findall(r'[YNPX]', text)) > 157:
                vote = self.house_get_vote(text, vote_file, session)
            else:
                vote = self.house_get_vote_with_images(text, vote_file,
                                                       session)
                self.house_add_votes_from_image(vote_file, vote)

            vote.add_source(url)
            if not self.house_check_vote(vote):
                self.logger.warning('Bad vote counts for %s' % vote)
                return
            self.save_vote(vote)
        finally:
            # The parsers above read the file, so it is removed last.
            os.remove(vote_file)
Example #12
0
    def scrape_vote(self, session, rollcall_number):
        """Fetch one House roll-call PDF, parse it and save the vote.

        session -- session name; its last run of digits goes into the URL
        rollcall_number -- roll-call number, zero-padded to 5 digits

        The number of Y/N/P/X characters in the converted text decides
        between the text parser and the image-based parser. The vote is
        saved only if ``house_check_vote`` accepts it; otherwise a warning
        is logged.

        The downloaded PDF is always deleted. Before, it was kept when the
        vote counts were rejected or when parsing raised.

        Raises ``self.EndOfHouseVotes`` when the PDF cannot be fetched.
        """

        # Build the roll-call PDF address.
        url = (
            'http://www.mass.gov/legis/journal/RollCallPdfs/'
            '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
        url_args = dict(session=re.findall(r'\d+', session).pop(),
                        rollcall=str(rollcall_number).zfill(5))
        url = url.format(**url_args)

        try:
            vote_file, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            # We'll hit a 404 at the end of the votes.
            raise self.EndOfHouseVotes

        try:
            text = convert_pdf(vote_file, type='text')
            text = text.decode('utf8')

            # A hack to guess whether this PDF has embedded images or
            # contains machine readable text.
            if len(re.findall(r'[YNPX]', text)) > 157:
                vote = self.house_get_vote(text, vote_file, session)
            else:
                vote = self.house_get_vote_with_images(text, vote_file,
                                                       session)
                self.house_add_votes_from_image(vote_file, vote)

            vote.add_source(url)
            if not self.house_check_vote(vote):
                self.logger.warning('Bad vote counts for %s' % vote)
                return
            self.save_vote(vote)
        finally:
            # Removed last because the parsers above still read the file.
            os.remove(vote_file)
Example #13
0
 def fetch_pdf_lines(self, href):
     """Return the lines of the PDF at ``href`` decoded from UTF-8.

     The file is downloaded, converted to text, split into lines, and
     deleted once every line has been decoded.
     """
     fname, resp = self.urlretrieve(href)
     raw_lines = convert_pdf(fname, 'text').splitlines()

     pdflines = []
     for raw in raw_lines:
         pdflines.append(raw.decode('utf-8'))

     os.remove(fname)
     return pdflines
Example #14
0
    def scrape_rollcall(self, vote, vurl):
        """Add the individual votes from a roll-call PDF to ``vote``.

        After the PDF at ``vurl`` has been downloaded, converted to text
        and deleted, each line is examined. Headings choose the recording
        method: YEAS/AYES -> ``vote.yes``, NAYS -> ``vote.no``,
        EXCUSED/NOT VOTING/ABSTAIN -> ``vote.other``. Names on the other
        lines go to the method chosen last. Empty lines, lines starting
        with "Page " and names preceding any heading are dropped.
        """
        (path, resp) = self.urlretrieve(vurl)
        converted = convert_pdf(path, 'text')
        os.remove(path)

        yes_heads = ('YEAS', 'AYES')
        other_heads = ('EXCUSED', 'NOT VOTING', 'ABSTAIN')

        # Recording method currently in effect; None before any heading
        target = None

        for text_line in converted.split('\n'):
            text_line = text_line.strip()

            # Headings switch what is being recorded
            if text_line.startswith(yes_heads):
                target = vote.yes
                continue
            if text_line.startswith('NAYS'):
                target = vote.no
                continue
            if text_line.startswith(other_heads):
                target = vote.other
                continue

            # Nothing to record on these lines
            if not text_line or text_line.startswith('Page '):
                continue
            if not target:
                continue

            # Names are split apart by 3 or more spaces
            for name in re.split('\s{3,}', text_line):
                if name:
                    target(name.strip())
Example #15
0
    def __init__(self, url, resp):
        """Store the text of a downloaded PDF without its empty lines.

        url -- kept as ``self.url``
        resp -- content of the PDF; written as-is to a temporary file
                that is handed to the converter

        The temporary file is deleted whether or not the conversion
        succeeds. The result is stored in ``self.text``.

        Raises PDFCommitteeVoteParseError when the conversion fails or
        yields only whitespace.
        """

        self.url = url

        # Put the document into a temp file for the converter.
        fd, filename = tempfile.mkstemp()
        try:
            # The with-block closes the descriptor mkstemp handed back.
            with os.fdopen(fd, 'wb') as f:
                f.write(resp)

            # Convert it to text.
            try:
                text = convert_pdf(filename, type='text')
            except Exception:
                msg = "couldn't convert pdf."
                raise PDFCommitteeVoteParseError(msg)
        finally:
            # Get rid of the temp file even when the conversion failed.
            os.remove(filename)

        if not text.strip():
            msg = 'PDF file was empty.'
            raise PDFCommitteeVoteParseError(msg)

        self.text = '\n'.join(filter(None, text.splitlines()))
Example #16
0
    def extract_rollcall_from_pdf(self,chamber,vote, bill, url,bill_id):
        """Download a roll-call PDF and record its votes on ``vote``.

        chamber, bill_id -- used for the temp-file prefix, passed on to
                            ``self.count_votes`` and used in debug output
        vote -- receives ``yes`` / ``no`` / ``other`` calls per legislator
        bill -- gets ``url`` added as a source
        url -- address of the roll-call PDF

        The PDF is saved to a named temporary file, converted to text and
        handed to ``self.count_votes``. Individual votes are recorded only
        when ``count_votes`` reports the data as valid.

        The temporary file is written through its own binary-mode handle,
        which is closed before conversion, and the file is removed even
        when downloading, converting or counting fails.
        """
        billnum = re.search(r"(\d+)", bill_id).group(1)
        self.debug("Scraping rollcall %s|%s|" % (billnum, url))

        bill_prefix = "vote_%s_%s_"  % (chamber, re.sub(r'\s+', '_', bill_id ))

        bill.add_source(url)

        # Save roll call pdf to a local file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf',
                                                prefix=bill_prefix)
        pdf_temp_name = temp_file.name
        try:
            self.debug("Parsing pdf votes, saving to tempfile [%s]" %
                       temp_file.name)
            with self.urlopen(url) as pdata:
                temp_file.write(pdata)
            # Flush and release the handle before the converter reads it
            temp_file.close()

            # Pdf is in pdf_temp_name
            rollcall_data = convert_pdf(pdf_temp_name, type='text')
            (valid_data, expected, areas, yays, nays, other) = \
                self.count_votes(url, chamber, bill_id, rollcall_data)
        finally:
            # close() is harmless when already closed; delete=False means
            # the file has to be unlinked explicitly.
            temp_file.close()
            os.unlink(pdf_temp_name)

        if valid_data:
            self.debug("VOTE %s %s yays %d nays %d other %d pdf=%s" %
                       (bill_id, chamber, len(yays), len(nays), len(other),
                        pdf_temp_name ))
            for legislator in yays:
                vote.yes(legislator)
            for legislator in nays:
                vote.no(legislator)
            for legislator in other:
                vote.other(legislator)
Example #17
0
    def scrape_upper_committee(self, url):
        """Build and save upper-chamber committees listed in a PDF.

        The PDF at ``url`` is downloaded and converted to XML. Within
        each page, a bold text beginning with "Comisi" creates a
        committee, and each text beginning with "Hon." becomes a member
        of it. When the text contains a dash, the part after the first
        dash selects the role (chairman, vicechairman, secretary);
        otherwise the role is 'member'. The committee is saved once its
        page has been read.

        A "Hon." line that precedes any committee heading is skipped and
        a page without a heading saves nothing; both situations used to
        fail because the committee was still ``None``.
        """
        # Officer titles as they appear in the document -> role
        role_by_title = {
            'Presidenta': 'chairman',
            'Presidente': 'chairman',
            'Vicepresidente': 'vicechairman',
            'Vicepresidenta': 'vicechairman',
            'Secretaria': 'secretary',
            'Secretario': 'secretary',
        }

        filename, resp = self.urlretrieve(url)
        try:
            converted = convert_pdf(filename, 'xml')
        finally:
            # Only the conversion needs the file; delete it even if the
            # conversion raises.
            os.remove(filename)

        root = lxml.etree.fromstring(converted)
        for link in root.xpath('/pdf2xml/page'):
            comm = None
            for line in link.findall('text'):
                text = line.findtext('b')
                if text is not None and text.startswith('Comisi'):
                    comm = Committee('upper', text)
                    comm.add_source(url)
                    continue

                if not (line.text and line.text.startswith('Hon.')):
                    continue
                if comm is None:
                    # Member listed before any committee heading
                    continue

                # Treat the en dash like a plain hyphen before splitting
                line_text = line.text.replace(u'–', '-')
                name_split = line_text.split(u'-', 1)
                title = 'member'
                if len(name_split) >= 2:
                    title = role_by_title.get(name_split[1].strip(),
                                              'member')
                if name_split[0] != 'VACANTE':
                    comm.add_member(name_split[0].replace('Hon.', ''),
                                    title)

            if comm is not None:
                self.save_committee(comm)
Example #18
0
 def scrape_joint_committee(self, url):
     """Log the XML conversion of the PDF at ``url``.

     The serialized XML tree is logged once if the document has at least
     one ``/pdf2xml/page`` element; otherwise nothing is logged. No
     committee is created or saved, and the downloaded file is not
     removed here.
     """
     filename, resp = self.urlretrieve(url)
     root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))
     if root.xpath('/pdf2xml/page'):
         self.log(lxml.etree.tostring(root))
Example #19
0
    def parse_senate_vote(self, url):
        """Build an upper-chamber 'senate passage' Vote from a PDF.

        The PDF at ``url`` is downloaded, converted to text, passed
        through ``convert_sv_text`` and deleted. Outside the vote table,
        lines supply the date ("DATE:mm-dd-yy") and the passed flag
        ("PASSED"); a line containing "YES NO ABS EXC" opens the table.
        Inside the table each line holds two names, each followed by
        whitespace and an "X"; the width of that whitespace tells the
        vote (1 -> yes, 2 -> no, anything else -> other). The "TOTALS"
        line supplies the counts and closes the table; when it also
        contains "GOVERNOR", the leading name on it is recorded as yes
        (width 1) or no.

        Returns the Vote.
        """
        vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
        vote.add_source(url)

        fname, resp = self.urlretrieve(url)
        sv_text = convert_sv_text(convert_pdf(fname, 'text'))
        os.remove(fname)

        inside_table = False

        for line in sv_text:
            if not inside_table:
                found = re.search('DATE:(\d{2}-\d{2}-\d{2})', line)
                if found:
                    vote['date'] =  datetime.strptime(found.groups()[0],
                                                      '%m-%d-%y')

                if 'YES NO ABS EXC' in line:
                    inside_table = True
                elif 'PASSED' in line:
                    vote['passed'] = True
                continue

            if 'TOTALS' not in line:
                # Regular table row: two (name, gap) pairs
                fields = re.match(' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X', line).groups()
                for name, gap in (fields[0:2], fields[2:4]):
                    # vote can be determined by # of spaces
                    width = len(gap)
                    if width == 1:
                        vote.yes(name)
                    elif width == 2:
                        vote.no(name)
                    else:
                        vote.other(name)
                continue

            # TOTALS row
            totals = line
            if 'GOVERNOR' in line:
                # Lt. Governor voted; the counts follow his mark
                name, gap, totals = re.match(' ([A-Z,.]+)(\s+)X(.*)',
                                             line).groups()
                if len(gap) == 1:
                    vote.yes(name)
                else:
                    vote.no(name)

            _, yes, no, abstained, excused = totals.split()
            vote['yes_count'] = int(yes)
            vote['no_count'] = int(no)
            vote['other_count'] = int(abstained)+int(excused)
            # no longer in votes
            inside_table = False

        return vote
Example #20
0
 def scrape_joint_committee(self,url):
     """Download the PDF at ``url``, convert it to XML and log the tree.

     Logging happens once, and only when the XML contains at least one
     ``/pdf2xml/page`` element. Nothing else is done with the document:
     no committee is built, and the downloaded file is left in place.
     """
     filename, resp = self.urlretrieve(url)
     xml_root = lxml.etree.fromstring(convert_pdf(filename,'xml'))
     pages = xml_root.xpath('/pdf2xml/page')
     if pages:
         self.log(lxml.etree.tostring(xml_root))
     return
Example #21
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse a roll-call PDF and add the resulting vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and deleted.
        The motion is the fifth line of the text; the counts come from
        "Yeas - N", "Nays - N" and "Not Voting - N". The method returns
        without adding anything when the text has fewer than five lines
        or no "Yeas" count.

        Individual votes are read from the tenth line onward. Each line
        is split into columns on "-<digits>"; a column of the form
        "<Y|N|EX|*> <name>" is recorded as yes for Y, no for N and other
        for the rest, and a column without that form is recorded as
        other. Reading stops at a line containing "after roll call" or
        "Indication of Vote", or at a column whose name is "PAIR"; lines
        containing "Presiding" are skipped.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, 'text')
        os.remove(path)

        rows = text.split('\n')
        if len(rows) < 5:
            return
        motion = rows[4].strip()

        yeas = re.search(r'Yeas - (\d+)', text)
        if yeas is None:
            return
        yes_count = int(yeas.group(1))

        no_count = int(re.search(r'Nays - (\d+)', text).group(1))
        other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        def record_row(row):
            # Records every column of one row; returns True when the
            # "PAIR" marker is reached and reading has to stop.
            for col in re.split(r'-\d+', row):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
                if not match:
                    vote.other(col)
                    continue

                mark, who = match.group(1), match.group(2)
                if who == "PAIR":
                    return True
                if mark == 'Y':
                    vote.yes(who)
                elif mark == 'N':
                    vote.no(who)
                else:
                    vote.other(who)
            return False

        for row in rows[9:]:
            if 'after roll call' in row or 'Indication of Vote' in row:
                break
            if 'Presiding' in row:
                continue
            if record_row(row):
                break

        vote.validate()
        bill.add_vote(vote)
Example #22
0
    def scrape_house(self, vote, vurl, supplement):
        """Record the House votes of one supplement entry on ``vote``.

        The PDF at ``vurl`` is downloaded, converted to text, deleted,
        decoded from UTF-8 and has U+2019 replaced by an apostrophe. The
        part between "No. <supplement>" and the next "MASSACHUSETTS" is
        the vote; when "No. <supplement>" is absent an info message is
        logged and nothing is recorded.

        Each row is split on runs of three spaces. Items equal to N, Y, X
        or P are vote marks; items that are empty or contain "NAYS",
        "YEAS", "=" or "/" are ignored; every other item is a voter name.
        Names and marks are paired in order of appearance: Y is recorded
        as yes, N as no, X and P as other.
        """
        # Read the PDF into memory
        (path, resp) = self.urlretrieve(vurl)
        raw_text = convert_pdf(path, 'text')
        os.remove(path)
        pdf_text = raw_text.decode('utf-8').replace(u'\u2019', "'")

        # Locate the text belonging to this supplement number
        sections = pdf_text.split('No. ' + str(supplement))
        if len(sections) < 2:
            self.info("No vote found in supplement for vote #%s" % supplement)
            return
        vote_text = sections[1].split('MASSACHUSETTS')[0]

        # Independent items of the vote text
        cells = [cell
                 for row in vote_text.splitlines()
                 for cell in row.split('   ')]

        mark_codes = {'N': 'n', 'Y': 'y', 'X': 'x', 'P': 'p'}
        ignored_tokens = ('NAYS', 'YEAS', '=', '/')

        vote_tally = []
        voters = []
        for cell in cells:
            # Remove whitespace and the after-vote '*' tag
            cell = cell.strip().strip('*').strip()

            if not cell or any(token in cell for token in ignored_tokens):
                continue
            if cell in mark_codes:
                # X is "not voting", P is "present"
                vote_tally.append(mark_codes[cell])
            else:
                voters.append(cell)

        # Pair names with marks and record them
        for voter, code in zip(voters, vote_tally):
            if code == 'y':
                vote.yes(voter)
            elif code == 'n':
                vote.no(voter)
            else:
                vote.other(voter)
Example #23
0
    def scrape_house(self, vote, vurl, supplement):
        """Add the individual House votes for ``supplement`` to ``vote``.

        After the PDF at ``vurl`` has been downloaded, converted to text
        and deleted, the text is decoded from UTF-8 and U+2019 is turned
        into an apostrophe. The vote is the text after
        "No. <supplement>" up to the next "MASSACHUSETTS"; if that marker
        is missing, an info message is logged and the method returns.

        Rows are cut into items on three consecutive spaces. N, Y, X and
        P items are vote marks; empty items and items containing "NAYS",
        "YEAS", "=" or "/" are dropped; the rest are voter names. The
        n-th name is matched with the n-th mark: Y -> yes, N -> no,
        X or P -> other.
        """
        # Read the PDF into memory
        (path, resp) = self.urlretrieve(vurl)
        converted = convert_pdf(path, 'text')
        os.remove(path)
        converted = converted.decode('utf-8').replace(u'\u2019', "'")

        # Cut out the text for this supplement number
        after_marker = converted.split('No. ' + str(supplement))
        if len(after_marker) < 2:
            self.info("No vote found in supplement for vote #%s" % supplement)
            return
        vote_text = after_marker[1].split('MASSACHUSETTS')[0]

        # Flatten all rows into a single list of items
        items = []
        for row in vote_text.splitlines():
            items += row.split('   ')

        # Mark in the document -> code kept in the tally
        tally_code = {'N': 'n', 'Y': 'y', 'X': 'x', 'P': 'p'}

        vote_tally = []
        voters = []
        for item in items:
            # Drop surrounding whitespace and the after-vote '*' tag
            item = item.strip().strip('*').strip()

            if item == '':
                continue
            if 'NAYS' in item or 'YEAS' in item:
                continue
            if '=' in item or '/' in item:
                continue

            code = tally_code.get(item)
            if code is None:
                voters.append(item)
            else:
                vote_tally.append(code)

        # Method of ``vote`` that records each tally code
        method_for = {'y': 'yes', 'n': 'no'}
        for who, code in zip(voters, vote_tally):
            getattr(vote, method_for.get(code, 'other'))(who)
Example #24
0
def ca_handler(filedata, metadata):
    """Extract bill text from a ``.pdf`` or ``.html`` document.

    PDF input is converted to text and passed through
    ``text_after_line_numbers``; HTML input yields the whitespace-collapsed
    content of the first ``<pre>`` element. Any other extension falls
    through and returns ``None``.
    """
    # NOTE(review): the body reads ``file``, which is not one of this
    # function's parameters -- confirm whether ``filedata`` (or a path taken
    # from ``metadata``) was intended.
    if file.endswith('.pdf'):
        # NOTE: this strips the summary, it'd be useful for search (but not SFM)
        pdf_lines = convert_pdf(file, 'text').splitlines()
        return text_after_line_numbers(pdf_lines)
    if file.endswith('.html'):
        markup = open(file).read()
        pre_nodes = lxml.html.fromstring(markup).xpath('//pre')
        return collapse_spaces(pre_nodes[0].text_content())
Example #25
0
 def text(self):
     """Return the text of the PDF at ``self.url``, caching it on the instance.

     A truthy ``self._text`` is returned as-is. Otherwise the PDF is
     downloaded through ``self.scraper``, converted to text, the temporary
     file is removed, and the result is stored in ``self._text``. An empty
     conversion result is falsy, so it is fetched again on the next call.
     """
     cached = getattr(self, '_text', None)
     if cached:
         return cached

     (path, resp) = self.scraper.urlretrieve(self.url)
     converted = convert_pdf(path, 'text')
     os.remove(path)

     self._text = converted
     return converted
Example #26
0
 def text(self):
     """Return the text of the PDF at ``self.url``, caching it on the instance.

     A truthy ``self._text`` is returned as-is. Otherwise the PDF is
     downloaded through ``self.scraper``, converted to text, the temporary
     file is removed, and the result is stored in ``self._text``. An empty
     conversion result is falsy, so it is fetched again on the next call.
     """
     cached = getattr(self, '_text', None)
     if cached:
         return cached

     (path, resp) = self.scraper.urlretrieve(self.url)
     converted = convert_pdf(path, 'text')
     os.remove(path)

     self._text = converted
     return converted
Example #27
0
def ca_handler(filedata, metadata):
    """Extract bill text from a ``.pdf`` or ``.html`` document.

    PDF input is converted to text and passed through
    ``text_after_line_numbers``; HTML input yields the whitespace-collapsed
    content of the first ``<pre>`` element. Any other extension falls
    through and returns ``None``.
    """
    # NOTE(review): the body reads ``file``, which is not one of this
    # function's parameters -- confirm whether ``filedata`` (or a path taken
    # from ``metadata``) was intended.
    if file.endswith('.pdf'):
        # NOTE: this strips the summary, it'd be useful for search (but not SFM)
        pdf_lines = convert_pdf(file, 'text').splitlines()
        return text_after_line_numbers(pdf_lines)
    if file.endswith('.html'):
        markup = open(file).read()
        pre_nodes = lxml.html.fromstring(markup).xpath('//pre')
        return collapse_spaces(pre_nodes[0].text_content())
Example #28
0
 def fetch_pdf_lines(self, href):
     """Download the PDF at ``href`` and return its text as decoded lines.

     The temporary download is removed once the lines have been decoded
     from UTF-8.
     """
     # download the file
     fname, resp = self.urlretrieve(href)

     raw_lines = convert_pdf(fname, 'text').splitlines()
     decoded = []
     for raw in raw_lines:
         decoded.append(raw.decode('utf-8'))

     os.remove(fname)
     return decoded
Example #29
0
 def scrape_vote(self, url, local=False):
     """Convert a vote PDF to XML and return the converted text.

     When ``local`` is false, ``url`` is downloaded first; a
     ``scrapelib.HTTPError`` is logged as a warning and ``None`` is
     returned. When ``local`` is true, ``url`` is used directly as the path
     of the PDF. In both cases the file that was converted is removed
     afterwards.
     """
     pdf_path = url
     if not local:
         try:
             pdf_path, resp = self.urlretrieve(url)
         except scrapelib.HTTPError:
             self.warning("Request failed: {}".format(url))
             return

     xml_text = convert_pdf(pdf_path, 'xml')
     os.remove(pdf_path)
     return xml_text
Example #30
0
 def scrape_vote(self, url, local=False):
     """Convert a vote PDF to XML and return the converted text.

     When ``local`` is false, ``url`` is downloaded first; a
     ``scrapelib.HTTPError`` is logged as a warning and ``None`` is
     returned. When ``local`` is true, ``url`` is used directly as the path
     of the PDF. In both cases the file that was converted is removed
     afterwards.
     """
     pdf_path = url
     if not local:
         try:
             pdf_path, resp = self.urlretrieve(url)
         except scrapelib.HTTPError:
             self.warning("Request failed: {}".format(url))
             return

     xml_text = convert_pdf(pdf_path, 'xml')
     os.remove(pdf_path)
     return xml_text
Example #31
0
    def scrape_votes(self, bill, votes_url):
        """Scrape every vote-history PDF linked from ``votes_url`` onto ``bill``.

        Each link whose href contains ``votehistory`` yields one ``Vote``.
        The motion and date come from the link text, the chamber from the
        neighbouring table cell, and the individual votes from the PDF text.
        Counts and the passage flag are derived from the names collected.
        """
        html = self.urlopen(votes_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(votes_url)

        EXPECTED_VOTE_CODES = ['Y', 'N', 'E', 'NV', 'A', 'P', '-']

        # vote indicator, a few spaces, a name, newline or multiple spaces
        VOTE_RE = re.compile(r'(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

        chamber_names = {'HOUSE': 'lower', 'SENATE': 'upper'}

        for link in doc.xpath('//a[contains(@href, "votehistory")]'):
            href = link.get('href')

            # Link text is ' - ' separated; the date is always the last
            # piece and a three-piece link carries the motion in the middle.
            pieces = link.text.split(' - ')
            date = pieces[-1]
            motion = pieces[1] if len(pieces) == 3 else 'Third Reading'

            chamber_label = link.xpath('../following-sibling::td/text()')[0]
            chamber = chamber_names.get(chamber_label)
            if chamber is None:
                # Unrecognised labels are reported and passed through as-is.
                self.warning('unknown chamber %s' % chamber_label)
                chamber = chamber_label

            date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

            # download the file
            fname, resp = self.urlretrieve(href)
            pdflines = convert_pdf(fname, 'text').splitlines()
            os.remove(fname)

            vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)

            for line in pdflines:
                for vcode, name in VOTE_RE.findall(line):
                    if vcode == 'Y':
                        vote.yes(name)
                    elif vcode == 'N':
                        vote.no(name)
                    else:
                        vote.other(name)

            # fake the counts
            for kind in ('yes', 'no', 'other'):
                vote[kind + '_count'] = len(vote[kind + '_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(href)

            bill.add_vote(vote)
Example #32
0
 def fetch_pdf_lines(self, href):
     """Download the PDF at ``href`` and return its text as decoded lines.

     Returns a list of UTF-8-decoded lines, or ``False`` when the download
     raises a ``scrapelib.HTTPError`` whose first argument contains
     ``'404'``. Any other ``HTTPError`` trips the assertion.
     """
     try:
         # download the file
         fname, resp = self.urlretrieve(href)
         raw_lines = convert_pdf(fname, 'text').splitlines()
         decoded = []
         for raw in raw_lines:
             decoded.append(raw.decode('utf-8'))
         os.remove(fname)
         return decoded
     except scrapelib.HTTPError as e:
         assert '404' in e.args[0], "File not found: {}".format(e)
         self.warning("404 error for vote; skipping vote")
         return False
Example #33
0
 def text(self):
     """Return the text of the PDF at ``self.url``, caching it on the instance.

     A truthy ``self._text`` is returned as-is. Otherwise the PDF is
     downloaded through ``self.scraper``, converted to text, the temporary
     file is removed, and the result is stored in ``self._text``.

     Raises:
         self.VoteParseError: if the download raises
             ``scrapelib.HTTPError`` (a warning is logged first).
     """
     text = getattr(self, "_text", None)
     if text:
         return text
     try:
         (path, resp) = self.scraper.urlretrieve(self.url)
     # scrapelib's exception class is spelled ``HTTPError``; the previous
     # ``HttpError`` spelling would raise AttributeError while handling a
     # failed fetch instead of converting it to VoteParseError.
     except scrapelib.HTTPError as exc:
         self.scraper.warning("Got error %r while fetching %s" % (exc, self.url))
         raise self.VoteParseError()
     text = convert_pdf(path, "text")
     os.remove(path)
     self._text = text
     return text
Example #34
0
    def scrape_senate(self, vote, vurl):
        """Add individual Senate votes from the roll-call PDF at ``vurl``.

        The converted text is read line by line. Lines starting with
        ``YEAS``, ``NAYS`` or ``ABSENT OR`` switch the current section;
        every other non-empty line is split on runs of three spaces into
        names, which are recorded on ``vote`` under the current section.
        Names seen before any section heading are ignored.
        """
        # download file to server
        (path, resp) = self.urlretrieve(vurl)
        pdf_text = convert_pdf(path, 'text')
        os.remove(path)

        # Section heading prefix -> mode, checked in this order.
        headings = (('YEAS', 'y'), ('NAYS', 'n'), ('ABSENT OR', 'o'))
        # Mode -> name of the ``vote`` method that records a name.
        recorders = {'y': 'yes', 'n': 'no', 'o': 'other'}
        mode = None

        for line in pdf_text.splitlines():
            # Replace the Unicode minus sign with an ASCII hyphen.
            line = line.strip().decode('utf-8').replace(u'\u2212', '-')
            if line == '':
                continue

            heading_mode = next(
                (m for prefix, m in headings if line.startswith(prefix)),
                None)
            if heading_mode is not None:
                mode = heading_mode
                continue

            # Otherwise the line holds names separated by three spaces.
            for raw_name in line.split('   '):
                raw_name = raw_name.strip()
                if raw_name == '':
                    continue

                # A trailing '-<digits>' piece is a vote count; drop it and
                # rejoin whatever preceded it.
                pieces = raw_name.split('-')
                if pieces[-1].strip(' .').isdigit():
                    clean_name = ''.join(pieces[:-1])
                else:
                    clean_name = raw_name

                if mode in recorders:
                    getattr(vote, recorders[mode])(clean_name)
Example #35
0
    def scrape_senate(self, vote, vurl):
        """Add individual Senate votes from the roll-call PDF at ``vurl``.

        The converted text is read line by line. Lines starting with
        ``YEAS``, ``NAYS`` or ``ABSENT OR`` switch the current section;
        every other non-empty line is split on runs of three spaces into
        names, which are recorded on ``vote`` under the current section.
        Names seen before any section heading are ignored.
        """
        # download file to server
        (path, resp) = self.urlretrieve(vurl)
        pdf_text = convert_pdf(path, 'text')
        os.remove(path)

        # Section heading prefix -> mode, checked in this order.
        headings = (('YEAS', 'y'), ('NAYS', 'n'), ('ABSENT OR', 'o'))
        # Mode -> name of the ``vote`` method that records a name.
        recorders = {'y': 'yes', 'n': 'no', 'o': 'other'}
        mode = None

        for line in pdf_text.splitlines():
            # Replace the Unicode minus sign with an ASCII hyphen.
            line = line.strip().decode('utf-8').replace(u'\u2212', '-')
            if line == '':
                continue

            heading_mode = next(
                (m for prefix, m in headings if line.startswith(prefix)),
                None)
            if heading_mode is not None:
                mode = heading_mode
                continue

            # Otherwise the line holds names separated by three spaces.
            for raw_name in line.split('   '):
                raw_name = raw_name.strip()
                if raw_name == '':
                    continue

                # A trailing '-<digits>' piece is a vote count; drop it and
                # rejoin whatever preceded it.
                pieces = raw_name.split('-')
                if pieces[-1].strip(' .').isdigit():
                    clean_name = ''.join(pieces[:-1])
                else:
                    clean_name = raw_name

                if mode in recorders:
                    getattr(vote, recorders[mode])(clean_name)
Example #36
0
    def scrape_senate_vote(self, bill, url, date):
        """Scrape a Senate 'Passage' vote PDF and attach it to ``bill``.

        A failed download is logged and the method returns ``None``. Text
        containing a ``Yea: N  Nay: N  Absent: N`` summary is handed to
        ``scrape_senate_vote_3col``. Otherwise the text is split on the
        ``Yea``/``Nay``/``Absent`` headings and the names after each
        heading are recorded and counted.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote('upper', date, 'Passage', passed=None,
                    yes_count=0, no_count=0, other_count=0)
        vote.add_source(url)

        text = convert_pdf(filename, 'text')
        os.remove(filename)

        if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            return self.scrape_senate_vote_3col(bill, vote, text, url, date)

        # Reversed so that pop() yields heading, names, heading, names...
        # in document order; empty pieces are discarded.
        pending = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        pending = filter(None, pending)
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)

        while pending:
            heading = pending.pop()
            key = keymap.get(heading.lower(), 'other')
            count_key = key + '_count'
            names_blob = pending.pop()

            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', names_blob):
                if name.lower().strip() == 'none.':
                    continue
                # Strip dot leaders, a trailing period, and any count digits.
                name = re.sub(r'\.$', '', name.replace('..', ''))
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                getattr(vote, key)(name)
                actual_vote[heading] += 1
                vote[count_key] += 1
            assert actual_vote[heading] == vote[count_key]

        vote['passed'] = vote['no_count'] < vote['yes_count']
        bill.add_vote(vote)
Example #37
0
    def parse_house_vote(self, url):
        """Parse a House roll-call PDF into a ``Vote``.

        House votes are PDFs that can be converted to text and require some
        nasty regex to get votes out reliably.

        Returns the populated ``Vote``, or ``None`` (after logging a
        warning) when the PDF yields no text or only the chamber heading.
        """
        fname, resp = self.urlretrieve(url)
        # Remove the download as soon as it has been converted, so the
        # early returns below (and conversion errors) do not leave the
        # temporary file behind.
        try:
            text = convert_pdf(fname, 'text')
        finally:
            os.remove(fname)

        stripped = text.strip()
        if not stripped:
            self.warning('image PDF %s' % url)
            return None

        # A PDF containing nothing but the heading has no vote data.
        if stripped == 'NEW MEXICO HOUSE OF REPRESENTATIVES':
            self.warning("What the heck: %s" % (url))
            return None

        # get date
        date = re.findall(r'(\d+/\d+/\d+)', text)[0]
        date = datetime.strptime(date, '%m/%d/%Y')

        # get totals
        yea, nay, exc, absent = self.HOUSE_TOTAL_RE.findall(text)[0]

        # make vote (faked passage indicator)
        vote = Vote('lower', date, 'house passage',
                    int(yea) > int(nay), int(yea), int(nay),
                    int(absent) + int(exc))
        vote.add_source(url)

        # votes
        real_votes = False
        for v, name in HOUSE_VOTE_RE.findall(text):
            # our regex is a bit broad, wait until we see 'Nays' to start
            # and end when we see CERTIFIED or ____ signature line
            if 'Nays' in name or 'Excused' in name:
                real_votes = True
                continue
            elif 'CERTIFIED' in name or '___' in name:
                break
            elif real_votes and name.strip():
                if v == 'Y':
                    vote.yes(name)
                elif v == 'N':
                    vote.no(name)
                else:  # excused/absent
                    vote.other(name)
        return vote
Example #38
0
    def parse_house_vote(self, url):
        """Parse a House roll-call PDF into a ``Vote``.

        House votes are PDFs that can be converted to text and require some
        nasty regex to get votes out reliably.

        Returns the populated ``Vote``, or ``None`` (after logging a
        warning) when the PDF yields no text or only the chamber heading.
        """
        fname, resp = self.urlretrieve(url)
        # Remove the download as soon as it has been converted, so the
        # early returns below (and conversion errors) do not leave the
        # temporary file behind.
        try:
            text = convert_pdf(fname, 'text')
        finally:
            os.remove(fname)

        stripped = text.strip()
        if not stripped:
            self.warning('image PDF %s' % url)
            return None

        # A PDF containing nothing but the heading has no vote data.
        if stripped == 'NEW MEXICO HOUSE OF REPRESENTATIVES':
            self.warning("What the heck: %s" % (url))
            return None

        # get date
        date = re.findall(r'(\d+/\d+/\d+)', text)[0]
        date = datetime.strptime(date, '%m/%d/%Y')

        # get totals
        yea, nay, exc, absent = self.HOUSE_TOTAL_RE.findall(text)[0]

        # make vote (faked passage indicator)
        vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                    int(yea), int(nay), int(absent) + int(exc))
        vote.add_source(url)

        # votes
        real_votes = False
        for v, name in HOUSE_VOTE_RE.findall(text):
            # our regex is a bit broad, wait until we see 'Nays' to start
            # and end when we see CERTIFIED or ____ signature line
            if 'Nays' in name or 'Excused' in name:
                real_votes = True
                continue
            elif 'CERTIFIED' in name or '___' in name:
                break
            elif real_votes and name.strip():
                if v == 'Y':
                    vote.yes(name)
                elif v == 'N':
                    vote.no(name)
                else:   # excused/absent
                    vote.other(name)
        return vote
Example #39
0
def parse_vote(scraper, chamber, doc_meta):
    """Download the vote PDF at ``doc_meta.url`` and parse it as a roll call.

    Returns the result of ``RollCallVote(...).vote()`` when the text
    contains ``"Roll Call"``.

    Raises:
        VoteParseError: if the download fails with ``scrapelib.HTTPError``
            or the document is not a roll-call vote (a warning is logged
            in both cases).
    """
    # Get the pdf text.
    try:
        (path, resp) = scraper.urlretrieve(doc_meta.url)
    except scrapelib.HTTPError as exc:
        # Report the URL that was requested; the bare name ``url`` is not
        # defined in this function and would raise NameError here.
        scraper.warning(
            "Got error %r while fetching %s" % (exc, doc_meta.url))
        raise VoteParseError()
    try:
        text = convert_pdf(path, "text")
    finally:
        # Always discard the download, even if conversion fails.
        os.remove(path)
    # These byte sequences are the UTF-8 encodings of U+00A0 (no-break
    # space) and U+00AD (soft hyphen); both become plain spaces.
    text = text.replace("\xc2\xa0", " ")
    text = text.replace("\xc2\xad", " ")

    # Figure out what type of vote this is.
    if "Roll Call" in text:
        return RollCallVote(text, scraper, chamber, doc_meta).vote()
    else:
        scraper.warning("Skipping a committee vote (See Jira issue DATA-80).")
        raise VoteParseError()
Example #40
0
def parse_vote(scraper, chamber, doc_meta):
    """Download the vote PDF at ``doc_meta.url`` and parse it as a roll call.

    Returns the result of ``RollCallVote(...).vote()`` when the text
    contains ``'Roll Call'``.

    Raises:
        VoteParseError: if the download fails with ``scrapelib.HTTPError``
            or the document is not a roll-call vote (a warning is logged
            in both cases).
    """
    # Get the pdf text.
    try:
        (path, resp) = scraper.urlretrieve(doc_meta.url)
    except scrapelib.HTTPError as exc:
        # Report the URL that was requested; the bare name ``url`` is not
        # defined in this function and would raise NameError here.
        scraper.warning(
            'Got error %r while fetching %s' % (exc, doc_meta.url))
        raise VoteParseError()
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Always discard the download, even if conversion fails.
        os.remove(path)
    # These byte sequences are the UTF-8 encodings of U+00A0 (no-break
    # space) and U+00AD (soft hyphen); both become plain spaces.
    text = text.replace('\xc2\xa0', ' ')
    text = text.replace('\xc2\xad', ' ')

    # Figure out what type of vote this is.
    if 'Roll Call' in text:
        return RollCallVote(text, scraper, chamber, doc_meta).vote()
    else:
        scraper.warning('Skipping a committee vote (See Jira issue DATA-80).')
        raise VoteParseError()
Example #41
0
    def scrape_vote(self, bill, chamber, date, url):
        """Scrape one roll-call PDF and attach the resulting vote to ``bill``.

        The motion is taken from the fifth line of the converted text and
        the totals from the ``Yeas``/``Nays``/``Not Voting`` summaries.
        Individual votes are read from the tenth line onward, stopping at
        an 'after roll call' or 'Indication of Vote' line. The vote counts
        as passed only when yeas exceed nays plus not-voting.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, 'text')
        os.remove(path)

        text_lines = text.split('\n')
        motion = text_lines[4].strip()

        yes_count, no_count, other_count = [
            int(re.search(pattern, text).group(1))
            for pattern in (r'Yeas - (\d+)',
                            r'Nays - (\d+)',
                            r'Not Voting - (\d+)')
        ]
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in text_lines[9:]:
            if 'after roll call' in line or 'Indication of Vote' in line:
                break
            if 'Presiding' in line:
                continue

            # Columns are delimited by a '-<digits>' token.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
                if not match:
                    # No recognisable code: record the whole cell as 'other'.
                    vote.other(col)
                    continue

                code, voter = match.groups()
                if code == 'Y':
                    vote.yes(voter)
                elif code == 'N':
                    vote.no(voter)
                elif code == '*':
                    # skip paired voters, don't factor into count
                    continue
                else:
                    vote.other(voter)

        vote.validate()
        bill.add_vote(vote)
Example #42
0
    def scrape_senate_vote(self, bill, url, date):
        """Scrape a Senate 'Passage' vote PDF and attach it to ``bill``.

        A failed download is logged and the method returns ``None``. Text
        containing a ``Yea: N  Nay: N  Absent: N`` summary is handed to
        ``scrape_senate_vote_3col``. Otherwise the text is split on the
        ``Yea``/``Nay``/``Absent`` headings and the names after each
        heading are recorded and counted.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote('upper', date, 'Passage', passed=None,
                    yes_count=0, no_count=0, other_count=0)
        vote.add_source(url)

        text = convert_pdf(filename, 'text')
        os.remove(filename)

        if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            return self.scrape_senate_vote_3col(bill, vote, text, url, date)

        # Reversed so that pop() yields heading, names, heading, names...
        # in document order; empty pieces are discarded.
        pending = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        pending = filter(None, pending)
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)

        while pending:
            heading = pending.pop()
            key = keymap.get(heading.lower(), 'other')
            count_key = key + '_count'
            names_blob = pending.pop()

            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', names_blob):
                if name.lower().strip() == 'none.':
                    continue
                # Strip dot leaders, a trailing period, and any count digits.
                name = re.sub(r'\.$', '', name.replace('..', ''))
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                getattr(vote, key)(name)
                actual_vote[heading] += 1
                vote[count_key] += 1
            assert actual_vote[heading] == vote[count_key]

        vote['passed'] = vote['no_count'] < vote['yes_count']
        bill.add_vote(vote)
Example #43
0
    def scrape_senate_vote(self, bill, url, date):
        """Scrape a Senate "Passage" vote PDF and attach it to ``bill``.

        A failed download is logged and the method returns ``None``. Text
        containing a ``Yea: N  Nay: N  Absent: N`` summary is handed to
        ``scrape_senate_vote_3col``. Otherwise the text is split on the
        ``Yea``/``Nay``/``Absent`` headings and the names after each
        heading are recorded and counted.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote("upper", date, "Passage", passed=None,
                    yes_count=0, no_count=0, other_count=0)
        vote.add_source(url)

        text = convert_pdf(filename, "text")
        os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            return self.scrape_senate_vote_3col(bill, vote, text, url, date)

        # Reversed so that pop() yields heading, names, heading, names...
        # in document order; empty pieces are discarded.
        pending = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
        pending = filter(None, pending)
        keymap = dict(yea="yes", nay="no")
        actual_vote = collections.defaultdict(int)

        while pending:
            heading = pending.pop()
            key = keymap.get(heading.lower(), "other")
            count_key = key + "_count"
            names_blob = pending.pop()

            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", names_blob):
                if name.lower().strip() == "none.":
                    continue
                # Strip dot leaders, a trailing period, and any count digits.
                name = re.sub(r"\.$", "", name.replace("..", ""))
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                getattr(vote, key)(name)
                actual_vote[heading] += 1
                vote[count_key] += 1
            assert actual_vote[heading] == vote[count_key]

        vote["passed"] = vote["no_count"] < vote["yes_count"]
        bill.add_vote(vote)
Example #44
0
    def scrape_vote(self, bill, chamber, date, url):
        """Scrape one roll-call PDF and attach the resulting vote to ``bill``.

        The motion is taken from the fifth line of the converted text and
        the totals from the ``Yeas``/``Nays``/``Not Voting`` summaries.
        Individual votes are read from the tenth line onward, stopping at
        an "after roll call" or "Indication of Vote" line. The vote counts
        as passed only when yeas exceed nays plus not-voting.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, "text")
        os.remove(path)

        text_lines = text.split("\n")
        motion = text_lines[4].strip()

        yes_count, no_count, other_count = [
            int(re.search(pattern, text).group(1))
            for pattern in (r"Yeas - (\d+)",
                            r"Nays - (\d+)",
                            r"Not Voting - (\d+)")
        ]
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in text_lines[9:]:
            if "after roll call" in line or "Indication of Vote" in line:
                break
            if "Presiding" in line:
                continue

            # Columns are delimited by a '-<digits>' token.
            for col in re.split(r"-\d+", line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r"(Y|N|EX)\s+(.+)$", col)
                if not match:
                    # No recognisable code: record the whole cell as 'other'.
                    vote.other(col)
                    continue

                code, voter = match.groups()
                if code == "Y":
                    vote.yes(voter)
                elif code == "N":
                    vote.no(voter)
                else:
                    vote.other(voter)

        vote.validate()
        bill.add_vote(vote)
Example #45
0
    def extract_rollcall_from_pdf(self, chamber, vote, bill, url, bill_id):
        """Download a roll-call PDF and record its individual votes on ``vote``.

        The PDF at ``url`` is saved to a temporary file, converted to text
        and passed to ``self.count_votes``. When that reports valid data,
        every name in the yay/nay/other lists is added to ``vote``. ``url``
        is also added to ``bill`` as a source. Returns ``None``.
        """
        billnum = re.search(r"(\d+)", bill_id).group(1)
        self.debug("Scraping rollcall %s|%s|" % (billnum, url))

        bill_prefix = "vote_%s_%s_" % (chamber, re.sub(r'\s+', '_', bill_id))

        bill.add_source(url)

        # Reserve a uniquely named local file for the roll call pdf. Only
        # the name is needed, so the handle is closed straight away rather
        # than being left open for the rest of the call.
        temp_file = tempfile.NamedTemporaryFile(delete=False,
                                                suffix='.pdf',
                                                prefix=bill_prefix)
        pdf_temp_name = temp_file.name
        temp_file.close()

        try:
            self.debug("Parsing pdf votes, saving to tempfile [%s]" %
                       pdf_temp_name)
            with self.urlopen(url) as pdata:
                # PDFs are binary data, so write them in binary mode.
                with open(pdf_temp_name, 'wb') as pdf_file:
                    pdf_file.write(pdata)

            # Pdf is in pdf_temp_name
            rollcall_data = convert_pdf(pdf_temp_name, type='text')
        finally:
            # Remove the temporary file even if download/conversion fails.
            os.unlink(pdf_temp_name)

        (valid_data, expected, areas, yays, nays,
         other) = self.count_votes(url, chamber, bill_id, rollcall_data)

        if valid_data:
            self.debug("VOTE %s %s yays %d nays %d other %d pdf=%s" %
                       (bill_id, chamber, len(yays), len(nays), len(other),
                        pdf_temp_name))
            for legislator in yays:
                vote.yes(legislator)
            for legislator in nays:
                vote.no(legislator)
            for legislator in other:
                vote.other(legislator)
Example #46
0
    def parse_house_vote(self, url):
        """Parse a House roll-call PDF into a ``Vote``.

        House votes are PDFs that can be converted to text and require some
        nasty regex to get votes out reliably.

        Returns the populated ``Vote``, or ``None`` (after logging a
        warning) when the PDF yields no text.
        """
        fname, resp = self.urlretrieve(url)
        # Remove the download as soon as it has been converted, so the
        # early return below (and conversion errors) do not leave the
        # temporary file behind.
        try:
            text = convert_pdf(fname, "text")
        finally:
            os.remove(fname)

        if not text.strip():
            self.warning("image PDF %s" % url)
            return None

        # get date
        date = re.findall(r"(\d+/\d+/\d+)", text)[0]
        date = datetime.strptime(date, "%m/%d/%y")

        # get totals
        absent, yea, nay, exc = self.HOUSE_TOTAL_RE.findall(text)[0]

        # make vote (faked passage indicator)
        vote = Vote("lower", date, "house passage", int(yea) > int(nay),
                    int(yea), int(nay), int(absent) + int(exc))
        vote.add_source(url)

        # votes
        real_votes = False
        for v, name in HOUSE_VOTE_RE.findall(text):
            # our regex is a bit broad, wait until we see 'Nays' to start
            # and end when we see CERTIFIED or ____ signature line
            if "Nays" in name or "Excused" in name:
                real_votes = True
                continue
            elif "CERTIFIED" in name or "___" in name:
                break
            elif real_votes and name.strip():
                if v == "Y":
                    vote.yes(name)
                elif v == "N":
                    vote.no(name)
                else:  # excused/absent
                    vote.other(name)
        return vote
Example #47
0
    def scrape_vote(self, bill, chamber, date, url):
        """Scrape one roll-call PDF and attach the resulting vote to ``bill``.

        The motion is taken from the fifth line of the converted text and
        the totals from the ``Yeas``/``Nays``/``Not Voting`` summaries.
        Individual votes are read from the tenth line onward, stopping at
        an 'after roll call' line. The vote counts as passed only when
        yeas exceed nays plus not-voting.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, 'text')
        os.remove(path)

        text_lines = text.split('\n')
        motion = text_lines[4].strip()

        yes_count, no_count, other_count = [
            int(re.search(pattern, text).group(1))
            for pattern in (r'Yeas - (\d+)',
                            r'Nays - (\d+)',
                            r'Not Voting - (\d+)')
        ]
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in text_lines[9:]:
            if 'after roll call' in line:
                break
            if 'Presiding' in line:
                continue

            # Columns are delimited by a '-<digits>' token.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX)\s+(.+)$', col)
                if not match:
                    # No recognisable code: record the whole cell as 'other'.
                    vote.other(col)
                    continue

                code, voter = match.groups()
                if code == 'Y':
                    vote.yes(voter)
                elif code == 'N':
                    vote.no(voter)
                else:
                    vote.other(voter)

        vote.validate()
        bill.add_vote(vote)
Example #48
0
def main():
    html = convert_pdf('openstates/ny/scripts/assembly_parties.pdf')
    doc = lxml.html.fromstring(html)
    dems = doc.xpath('//text[@font="4"]/b/text()')
    dems = map(getname, dems)
    repubs = doc.xpath('//text[@font="5"]/i/b/text()')
    repubs = map(getname, repubs)

    name_to_party = {}
    for list_, party in ((dems, 'Democratic'), (repubs, 'Republican')):
        for name in list_:
            print name, 'matched',
            try:
                full_name = difflib.get_close_matches(name, legs).pop(0)
            except IndexError:
                print 'NO MATCH FOUND'
                continue
            print full_name, ':: party = ', party
            name_to_party[full_name] = party
            print party

    # import pprint
    # pprint.pprint(name_to_party)

    print 'party_dict = {'
    it = iter(name_to_party.items())
    while True:

        try:
            # Col 1
            full_name1, party1 = next(it)
            full_name2, party2 = next(it)

            print ('\n    %r: %r,' % (full_name1, party1)).ljust(45),
            print ('%r: %r,' % (full_name2, party2))
        except StopIteration:
            break

    print '    }'
Example #49
0
    def parse_house_vote(self, url):
        """Parse a House roll-call PDF into a ``Vote``.

        Returns the populated ``Vote``, or ``None`` (after logging a
        warning) when the PDF yields no text.
        """
        fname, resp = self.urlretrieve(url)
        # Remove the download as soon as it has been converted, so the
        # early return below (and conversion errors) do not leave the
        # temporary file behind.
        try:
            text = convert_pdf(fname, 'text')
        finally:
            os.remove(fname)

        if not text.strip():
            self.warning('image PDF %s' % url)
            return None

        # get date
        date = re.findall(r'(\d+/\d+/\d+)', text)[0]
        date = datetime.strptime(date, '%m/%d/%y')

        # get totals
        absent, yea, nay, exc = self.HOUSE_TOTAL_RE.findall(text)[0]

        # make vote (faked passage indicator)
        vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                    int(yea), int(nay), int(absent)+int(exc))
        vote.add_source(url)

        # votes
        real_votes = False
        for v, name in HOUSE_VOTE_RE.findall(text):
            # our regex is a bit broad, wait until we see 'Nays' to start
            # and end when we see CERTIFIED or ____ signature line
            if 'Nays' in name or 'Excused' in name:
                real_votes = True
                continue
            elif 'CERTIFIED' in name or '___' in name:
                break
            elif real_votes and name.strip():
                if v == 'Y':
                    vote.yes(name)
                elif v == 'N':
                    vote.no(name)
                else:   # excused/absent
                    vote.other(name)
        return vote
Example #50
0
    def add_senate_votes(self, vote, filename):
        """Populate ``vote`` with counts and names from a Senate vote PDF.

        The PDF is converted to XML and its ``<text>`` items are walked in
        order. ``AYES``, ``NAYS`` and ``NOT VOTING`` headings set the
        matching count (the number after a Unicode minus sign) and choose
        where the following items are recorded; a ``SEQUENCE NO`` item
        stops recording until the next heading.
        """
        xml = convert_pdf(filename, 'xml')
        doc = lxml.html.fromstring(xml)  # use lxml.html for text_content()

        # Heading prefix -> vote kind, checked in this order.
        sections = (('AYES', 'yes'), ('NAYS', 'no'), ('NOT VOTING', 'other'))
        separator = u' \u2212 '

        # what to do with the pieces
        vfunc = None

        for textitem in doc.xpath('//text'):
            text = textitem.text_content().strip()

            for prefix, kind in sections:
                if text.startswith(prefix):
                    vfunc = getattr(vote, kind)
                    vote[kind + '_count'] = int(text.split(separator)[1])
                    break
            else:
                if text.startswith('SEQUENCE NO'):
                    vfunc = None
                elif vfunc:
                    vfunc(text)
Example #51
0
    def scrape_house(self, session):
        url = journals % (session, 'House')
        page = self.lxmlize(url)
        hrefs = page.xpath("//font//a")

        for href in hrefs:
            (path, response) = self.urlretrieve(href.attrib['href'])
            data = convert_pdf(path, type='text')

            in_vote = False
            cur_vote = {}
            known_date = None
            cur_vote_count = None
            in_question = False
            cur_question = None
            cur_bill_id = None

            for line in data.split("\n"):
                if known_date is None:
                     dt = date_re.findall(line)
                     if dt != []:
                        dt, dow = dt[0]
                        known_date = datetime.datetime.strptime(dt,
                            "%A, %B %d, %Y")

                non_std = False
                if re.match("(\s+)?\d+.*", line) is None:
                    non_std = True
                    l = line.lower().strip()
                    skip = False
                    blacklist = [
                        "house",
                        "page",
                        "general assembly",
                        "state of colorado",
                        "session",
                        "legislative day"
                    ]
                    for thing in blacklist:
                        if thing in l:
                            skip = True
                    if skip:
                        continue

                found = re.findall(
                    "(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{3,4})",
                    line
                )
                if found != []:
                    found = found[0]
                    cur_bill_id, chamber, typ = found

                try:
                    if not non_std:
                        _, line = line.strip().split(" ", 1)
                    line = line.strip()
                except ValueError:
                    in_vote = False
                    in_question = False
                    continue

                if in_question:
                    cur_question += " " + line.strip()
                    continue

                if ("The question being" in line) or \
                   ("On motion of" in line) or \
                   ("the following" in line) or \
                   ("moved that the" in line):
                    cur_question = line.strip()
                    in_question = True


                if in_vote:
                    if line == "":
                        likely_garbage = True

                    likely_garbage = False
                    if "co-sponsor" in line.lower():
                        likely_garbage = True

                    if 'the speaker' in line.lower():
                        likely_garbage = True

                    votes = re.findall(votes_re, line)
                    if likely_garbage:
                        votes = []

                    for person, _, v in votes:
                        cur_vote[person] = v

                    last_line = False
                    for who, _, vote in votes:
                        if who.lower() == "speaker":
                            last_line = True

                    if votes == [] or last_line:
                        in_vote = False
                        # save vote
                        yes, no, other = cur_vote_count
                        if cur_bill_id is None or cur_question is None:
                            continue

                        bc = {
                            "H": "lower",
                            "S": "upper",
                            "J": "joint"
                        }[cur_bill_id[0].upper()]

                        vote = Vote('lower',
                                    known_date,
                                    cur_question,
                                    (yes > no),
                                    yes,
                                    no,
                                    other,
                                    session=session,
                                    bill_id=cur_bill_id,
                                    bill_chamber=bc)

                        vote.add_source(href.attrib['href'])
                        vote.add_source(url)

                        for person in cur_vote:
                            if person is None:
                                continue

                            vot = cur_vote[person]

                            if person.endswith("Y"):
                                vot = "Y"
                                person = person[:-1]
                            if person.endswith("N"):
                                vot = "N"
                                person = person[:-1]
                            if person.endswith("E"):
                                vot = "E"
                                person = person[:-1]

                            if vot == 'Y':
                                vote.yes(person)
                            elif vot == 'N':
                                vote.no(person)
                            elif vot == 'E' or vot == '-':
                                vote.other(person)

                        self.save_vote(vote)

                        cur_vote = {}
                        in_question = False
                        cur_question = None
                        in_vote = False
                        cur_vote_count = None
                        continue

                summ = vote_re.findall(line)
                if summ == []:
                    continue
                summ = summ[0]
                yes, no, exc, ab = summ
                yes, no, exc, ab = \
                        int(yes), int(no), int(exc), int(ab)
                other = exc + ab
                cur_vote_count = (yes, no, other)
                in_vote = True
                continue
            os.unlink(path)
Example #52
0
    def scrape_house_vote(self, bill, url):
        """Parse a House roll-call PDF and attach the resulting vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and scanned line
        by line for the vote date, the totals line
        (``YEAS: n  NAYS: n  NOT VOTING: n``) and the member lists that follow
        section headers such as ``YEAS: 40``.

        Args:
            bill: Bill object that receives the vote via ``add_vote``.
            url: Location of the roll-call PDF.

        Returns:
            None. When the PDF cannot be downloaded a warning is logged and
            nothing is added to ``bill``.

        Raises:
            AssertionError: If the number of names collected for a category
                differs from the count reported on the totals line.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, 'text')
        os.remove(filename)

        lines = text.splitlines()

        # Category whose member names are currently being read
        # ('yes', 'no', 'other', 'paired'), or None before the first header.
        vote_type = None
        votes = collections.defaultdict(list)

        for idx, line in enumerate(lines):
            line = line.rstrip()

            # A line ending in m/d/yyyy carries the vote date.
            match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
            if match:
                # The motion text sits two lines above the totals line. Guard
                # against a negative index, which would silently wrap around
                # to the end of the document.
                motion = lines[idx - 2].strip() if idx >= 2 else ''
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                # Excused members are counted together with "not voting".
                exc_match = re.search(r'EXCUSED: (\d+)', line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(('ADOPTED', 'PASSED'))
                continue

            # Section header such as "YEAS: 40" switches the active category.
            match = re.match(
                r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
            if match:
                vote_type = {
                    'YEAS': 'yes',
                    'NAYS': 'no',
                    'NOT VOTING': 'other',
                    'EXCUSED': 'other',
                    'PAIRED': 'paired'
                }[match.group(1)]
                continue

            if vote_type == 'paired':
                # A line may hold several "Name (YEA)" / "Name (NAY)" entries.
                # Match every entry on the line; matching only from the start
                # of the line would record the first name once per entry.
                for name, pair_type in re.findall(r'([^\(]+)\((YEA|NAY)\)',
                                                  line):
                    name = name.strip()
                    if pair_type == 'YEA':
                        votes['yes'].append(name)
                    elif pair_type == 'NAY':
                        votes['no'].append(name)
            elif vote_type:
                # Names are laid out in columns separated by runs of spaces.
                for name in line.split('   '):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        vote = Vote('lower', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        vote['yes_votes'] = votes['yes']
        vote['no_votes'] = votes['no']
        vote['other_votes'] = votes['other']

        # Sanity check: parsed names must agree with the reported totals.
        assert len(vote['yes_votes']) == yes_count
        assert len(vote['no_votes']) == no_count
        assert len(vote['other_votes']) == other_count

        bill.add_vote(vote)
Example #53
0
    def scrape_digest(self, bill):
        """Fill in ``bill`` from its PDF digest.

        The digest is downloaded and converted to text, then split into three
        parts: the description (stored as ``bill['description']``), the
        sponsors (added with ``bill.add_sponsor``) and the dated action lines
        (added with ``bill.add_action``). Each ``ROLL CALL`` section found
        among the action lines is turned into a ``Vote`` attached to the bill.

        Args:
            bill: Bill mapping providing ``session``, ``bill_id`` and
                ``chamber``; it is updated in place.

        Returns:
            None. A warning is logged and the method returns early when the
            digest cannot be downloaded or converts to empty text.
        """
        import os  # function-scope import: only used to remove the download

        digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.pdf' % bill
        bill.add_source(digest_url)

        try:
            (filename, response) = self.urlretrieve(digest_url)
            try:
                all_text = convert_pdf(filename, type='text')
            finally:
                # The downloaded file is only needed for the conversion.
                os.remove(filename)
        except scrapelib.HTTPError:
            self.warning('no digest for %s' % bill['bill_id'])
            return
        if all_text.strip() == "":
            self.warning('Non-functional digest for bill {}'.format(
                bill['bill_id']))
            return

        # Split the digest's text into sponsors, description, and actions
        SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
        DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'
        ACTIONS_RE = r'(?sm)\n\n(\d{1,2}/\d{1,2}/\d{4}.*)'

        ext_title = re.search(DESCRIPTION_RE, all_text).group(1)
        bill_desc = ext_title.replace('\n', ' ')
        # Collapse runs of spaces left behind by the joined lines.
        bill_desc = re.sub("  *", " ",
                           bill_desc.decode('utf-8')).encode('utf-8')
        bill['description'] = bill_desc

        sponsor_span = re.search(SPONSOR_RE, all_text).group(1)
        sponsors = sponsor_span.replace('\n', ' ')
        if sponsors:
            if 'Committee' in sponsors:
                # A committee sponsor is recorded as a single name.
                bill.add_sponsor('primary', sponsors)
            else:
                # Sponsors from the other chamber follow an "and ...(s)" label.
                if bill['chamber'] == 'lower':
                    sp_lists = sponsors.split('and Senator(s)')
                else:
                    sp_lists = sponsors.split('and Representative(s)')
                for spl in sp_lists:
                    for sponsor in split_names(spl):
                        sponsor = sponsor.strip()
                        if sponsor != "":
                            bill.add_sponsor('primary', sponsor)

        action_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
        vote_total_re = re.compile(
            r'(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)'
        )
        # Prefixes that introduce a list of voters on the same line.
        breakers = [
            "Ayes:", "Nays:", "Nayes:", "Excused:", "Absent:",
            "Conflicts:"
        ]

        # initial actor is bill chamber
        actor = bill['chamber']
        action_lines = re.search(ACTIONS_RE, all_text).group(1).split('\n')
        # A shared iterator lets the roll-call loop below consume lines so
        # that the outer loop resumes after the vote.
        action_lines = iter(action_lines)
        for line in action_lines:
            line = clean_line(line)

            # skip blank lines
            if not line:
                continue

            amatch = action_re.match(line)
            if amatch:
                date, achamber, action = amatch.groups()

                # change actor if one is on this action
                if achamber == 'H ':
                    actor = 'lower'
                elif achamber == 'S ':
                    actor = 'upper'

                date = datetime.datetime.strptime(date, '%m/%d/%Y')
                bill.add_action(actor,
                                action.strip(),
                                date,
                                type=categorize_action(action))
            elif line == 'ROLL CALL':
                voters = defaultdict(str)
                # if we hit a roll call, use an inner loop to consume lines
                # in a pseudo-state machine manner, 3 types
                # Ayes|Nays|Excused|... - indicates next line is voters
                # : (Senators|Representatives): ... - voters
                # \d+ Nays \d+ Excused ... - totals
                voters_type = None
                for ainext in action_lines:
                    nextline = clean_line(ainext)
                    if not nextline:
                        continue

                    for breaker in breakers:
                        if nextline.startswith(breaker):
                            voters_type = breaker[:-1]
                            if voters_type == "Nayes":
                                voters_type = "Nays"
                                self.log("Fixed a case of 'Naye-itis'")
                            # Keep the ':' so the line is handled as a voter
                            # list below.
                            nextline = nextline[len(breaker) - 1:]

                    totals_match = vote_total_re.match(nextline)

                    if nextline.startswith(': '):
                        voters[voters_type] = nextline
                    elif nextline in ('Ayes', 'Nays', 'Excused', 'Absent',
                                      'Conflicts'):
                        voters_type = nextline
                    elif totals_match:
                        totals = totals_match.groups()
                        # Indexes of the numeric groups in vote_total_re.
                        ayes = totals[1]
                        nays = totals[4]
                        exc = totals[7]
                        absent = totals[10]
                        con = totals[13]

                        # The vote belongs to the most recent action line.
                        passed = (('Passed' in action or 'Do Pass' in action
                                   or 'Did Concur' in action
                                   or 'Referred to' in action)
                                  and 'Failed' not in action)
                        vote = Vote(actor, date, action, passed, int(ayes),
                                    int(nays),
                                    int(exc) + int(absent) + int(con))
                        vote.add_source(digest_url)

                        for vtype, names in voters.iteritems():
                            for voter in split_names(names):
                                if voter:
                                    if vtype == 'Ayes':
                                        vote.yes(voter)
                                    elif vtype == 'Nays':
                                        vote.no(voter)
                                    else:
                                        vote.other(voter)
                        # done collecting this vote
                        bill.add_vote(vote)
                        break
                    else:
                        # if it is a stray line within the vote, it is a
                        # continuation of the voter list
                        # (sometimes has a newline)
                        voters[voters_type] += ' ' + nextline
Example #54
0
    def scrape_uppper_committee_vote(self, bill, date, url):
        """Parse an upper-chamber committee vote PDF and add it to ``bill``.

        The motion is the text after ``FINAL ACTION:`` on the sixth line of
        the converted PDF. Member rows follow the ``Yea  Nay`` column header
        and stop at the first blank line. A member whose row starts with the
        code ``X``, ``VA`` or ``VC`` is recorded as yes or no depending on the
        column where the code sits; every other member is recorded as
        "other". Yes/no counts come from the ``TOTALS`` row.

        Args:
            bill: Bill object that receives the vote via ``add_vote``.
            date: Date passed through to the ``Vote``.
            url: Location of the committee vote PDF.

        Returns:
            None. A warning is logged and nothing is added when the motion
            text is empty.

        Raises:
            AssertionError: If a vote code lies between the Yea and Nay
                columns.
        """
        pdf_path, _response = self.urlretrieve(url)
        text = convert_pdf(pdf_path, 'text')
        lines = text.split("\n")
        os.remove(pdf_path)

        _label, motion = lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.warning("Vote appears to be empty")
            return

        # Index of the first line holding the "Yea  Nay" column headers
        header_row = [
            idx for idx, candidate in enumerate(lines)
            if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', candidate)
        ][0]
        header = lines[header_row]
        yea_columns_end = header.index("Yea") + len("Yea")
        nay_columns_begin = header.index("Nay")

        member_pattern = (
            r'''(?x)
                        ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                        ([A-Z][a-z]+  # Name must have lower-case characters
                        [\w\-\s]+)  # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?  # Leadership has an all-caps title
                        (?:\s{2,}.*)?  # Name ends when many spaces are seen
                        ''')

        yes_names = []
        no_names = []
        other_names = []
        for row in lines[header_row + 1:]:
            # End loop as soon as no more members are found
            if not row.strip():
                break

            member = re.search(member_pattern, row).group(1)

            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            cast_vote = re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]',
                                  row) is not None
            if not cast_vote:
                other_names.append(member)
                continue

            # The indentation of the row tells which column the code is in
            code_column = len(row) - len(row.lstrip())
            if code_column <= yea_columns_end:
                yes_names.append(member)
            elif code_column >= nay_columns_begin:
                no_names.append(member)
            else:
                raise AssertionError(
                    "Unparseable vote found for {0} in {1}:\n{2}".format(
                        member, url, row))

        totals_match = re.search(
            r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS', text)
        totals = totals_match.groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        other_count = len(other_names)
        passed = (yes_count > no_count)

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)
        vote['yes_votes'] = yes_names
        vote['no_votes'] = no_names
        vote['other_votes'] = other_names

        vote.validate()
        bill.add_vote(vote)
Example #55
0
    def scrape_floor_vote(self, chamber, bill, date, url):
        """Parse a floor vote PDF and add the resulting vote to ``bill``.

        The converted PDF is read by fixed line positions: the motion name,
        the ``Yeas - n  Nays - n  Not Voting - n`` totals line and the first
        line of the member roll. Those positions are shifted when the motion
        line is empty or when the motion spills over onto following lines.

        Args:
            chamber: Chamber passed through to the ``Vote``.
            bill: Bill object that receives the vote via ``add_vote``.
            date: Date passed through to the ``Vote``.
            url: Location of the floor vote PDF.

        Raises:
            AssertionError: If no motion name can be found.
        """
        pdf_path, _response = self.urlretrieve(url)
        text = convert_pdf(pdf_path, 'text')
        lines = text.split("\n")
        os.remove(pdf_path)

        motion_idx = 4
        totals_idx = 6
        roll_start_idx = 9

        motion = lines[motion_idx].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if not (motion
                or lines[motion_idx - 1].startswith("Calendar Page:")):
            motion = lines[motion_idx - 1]
            motion_idx -= 1
            totals_idx -= 1
            roll_start_idx -= 1
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        # Up to two further non-blank lines are appended to the motion name;
        # each one pushes the totals and the roll down by a line.
        for _ in range(2):
            motion_idx += 1
            extra_text = lines[motion_idx].strip()
            if not extra_text:
                break
            motion = "{}, {}".format(motion, extra_text)
            totals_idx += 1
            roll_start_idx += 1

        totals_match = re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[totals_idx])
        yes_count, no_count, other_count = [
            int(count) for count in totals_match.groups()
        ]
        passed = (yes_count > no_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        # The member roll runs until the first blank line.
        roll_lines = []
        for candidate in lines[roll_start_idx:]:
            if not candidate.strip():
                break
            roll_lines.append(candidate)

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        recorders = (
            (r'\s*Y\s+(.*?)-\d{1,3}\s*', vote.yes),
            (r'\s*N\s+(.*?)-\d{1,3}\s*', vote.no),
            (r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', vote.other),
        )
        for roll_line in roll_lines:
            # Drop the presiding officer's title so only the name remains
            if " President " in roll_line:
                roll_line = roll_line.replace(" President ", " ")
            elif " Speaker " in roll_line:
                roll_line = roll_line.replace(" Speaker ", " ")

            for pattern, record in recorders:
                for member in re.findall(pattern, roll_line):
                    record(member)

        try:
            vote.validate()
        except ValueError:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.logger.info("Votes don't add up; looking for additional ones")
            for roll_line in roll_lines:
                uncoded = re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                     roll_line)
                for member in uncoded:
                    vote.other(member)

        vote.validate()
        bill.add_vote(vote)
Example #56
0
    def scrape(self, chamber, session):
        """Scrape roll-call votes from a chamber's journal PDFs for a session.

        The session's journal index page is fetched and every linked PDF is
        converted to text. Each document is then walked line by line with a
        small state machine:

        * outside a vote, a ``ROLL CALL`` line starts motion capture;
        * motion text is accumulated until a line ending in ``VOTING`` /
          ``VOTING.``, after which voter lists are captured;
        * voter lists (``CATEGORY: name; name; ...``) are collected until a
          line containing a decision word (passed, adopted, ...), at which
          point one ``Vote`` per referenced bill is saved.

        Args:
            chamber: ``'lower'`` selects the house journal, anything else the
                senate journal.
            session: Session identifier; must be a key of the slug table.

        Raises:
            KeyError: If ``session`` is not a known session.
            AssertionError: If motion and vote capture are active at the same
                time, or if vote data ends up inside the motion text.
        """
        chamber_name = 'house' if chamber == 'lower' else 'senate'
        session_slug = {
            '62': '62-2011',
            '63': '63-2013',
            '64': '64-2015',
            '65': '65-2017',
        }[session]

        # Bill references such as "HB 1234" or "SCR 4001"
        bill_re = re.compile(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)")
        # Words that mark the line announcing the outcome of a vote
        decision_words = ('passed', 'adopted', 'sustained', 'prevailed',
                          'lost', 'failed')
        # Journal category heading -> Vote category
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other"
        }
        chambers = {"H": "lower", "S": "upper", "J": "joint"}

        # Open the index page of the session's Registers, and open each
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug, chamber_name)
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []
            name_may_be_continued = False

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib['href']

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                # Best effort: skip this journal, but leave a trace of it
                self.warning(
                    "Could not retrieve {}; skipping.".format(pdf_url))
                continue

            # Convert the PDF to text
            data = convert_pdf(path, type='text')
            os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            lines = data.splitlines()
            for line in lines:

                # Ignore lines with no information
                if re.search(chamber_re, line) or \
                        re.search(date_re, line) or \
                        re.search(page_re, line) or \
                        line.strip() == "":
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                        "Scraper should not be simultaneously processing " +
                        "motion name and votes, as it is for this motion: " +
                        cur_motion)

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith(("VOTING", "VOTING.")):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    lowered = line.lower()
                    is_decision = any(
                        word in lowered for word in decision_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" \
                            in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [
                            x.strip() for x in who.split(';')
                            if x.strip() != ""
                        ]
                        results[cur_vote] = who

                        # Without a trailing ';' the last name may wrap onto
                        # the next line
                        name_may_be_continued = not line.endswith(";")

                    elif cur_vote is not None and not is_decision:
                        bill_refs = bill_re.findall(line)
                        if bill_refs:
                            # Bill numbers in closing text that spans
                            # multiple lines
                            bills.extend(bill_refs)
                        else:
                            # Continuation of the current voter list
                            who = [
                                x.strip() for x in line.split(";")
                                if x.strip() != ""
                            ]

                            # Only join onto a previous name when there is
                            # one: the category line may have held no names,
                            # and this line may hold none either.
                            if name_may_be_continued and who \
                                    and results[cur_vote]:
                                results[cur_vote][-1] = \
                                    results[cur_vote][-1] + " " + who.pop(0)

                            name_may_be_continued = not line.endswith(";")

                            results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif is_decision:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(bill_re.findall(line))

                        if bills == [] or cur_motion.strip() == "":
                            # Log before wiping the state so the motion name
                            # actually appears in the message.
                            self.warning("No motion or bill name found: " +
                                         "motion name: " + cur_motion + "; " +
                                         "decision text: " + line.strip())
                            results = {}
                            cur_motion = ""
                            bills = []
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                "Vote data found in motion name: " +
                                cur_motion)

                        # Use the collected results to determine who voted how
                        # (real lists, so len() works on any Python version)
                        res = {}
                        for key, category in keys.items():
                            res[category] = [
                                name for name in results.get(key, [])
                                if name != ""
                            ]

                        # Count the number of members voting each way
                        yes = len(res['yes'])
                        no = len(res['no'])
                        other = len(res['other'])

                        # Almost all of the time, a vote only applies to one
                        # bill and this loop will only be run once.
                        # Some exceptions exist.
                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            try:
                                bc = chambers[cur_bill_id[0]]
                            except KeyError:
                                bc = 'other'

                            # Determine whether or not the vote passed
                            if "over the governor's veto" in \
                                    cur_motion.lower():
                                # More than two thirds of yes+no votes are
                                # needed. Integer arithmetic avoids Python 2
                                # floor division and a division by zero when
                                # nobody voted yes or no.
                                passed = (3 * yes > 2 * (yes + no))
                            else:
                                passed = (yes > no)

                            # Create a Vote object based on the scraped
                            # information
                            vote = Vote(chamber,
                                        cur_date,
                                        cur_motion,
                                        passed,
                                        yes,
                                        no,
                                        other,
                                        session=session,
                                        bill_id=cur_bill_id,
                                        bill_chamber=bc)

                            vote.add_source(pdf_url)
                            vote.add_source(url)

                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                obj = getattr(vote, key)
                                for person in res[key]:
                                    obj(person)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name, category in keys.items():
                                # Need to search for the singular, not plural,
                                # in the text so it can find, for example,
                                # " 1 NAY "
                                count_re = r"(\d+)\s{}".format(
                                    category_name[:-1])
                                motion_count = int(
                                    re.findall(count_re, cur_motion)[0])
                                count_key = category + "_count"
                                vote_count = vote[count_key]

                                if motion_count != vote_count:
                                    self.warning(
                                        "Motion text vote counts ({}) ".format(
                                            motion_count) +
                                        "differed from roll call counts ({}) ".
                                        format(vote_count) +
                                        "for {0} on {1}".format(
                                            category_name, cur_bill_id))
                                    # The motion text is treated as
                                    # authoritative for the count
                                    vote[count_key] = motion_count

                            self.save_vote(vote)

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
Example #57
0
    def scrape_votes(self, url, motion, date, chamber):
        """Build a ``Vote`` from the roll-call PDF at ``url``.

        The PDF text is split into comma-separated names. Section markers
        such as ``Yeas--`` or ``Nays--`` select the list that following names
        are appended to; ``None.`` or a ``Total--`` entry closes the section.

        Args:
            url: Location of the roll-call PDF.
            motion: Key into ``self._vote_mapping``, which supplies the
                motion text and whether the vote passed.
            date: Date passed through to the ``Vote``.
            chamber: Chamber passed through to the ``Vote``.

        Returns:
            A ``Vote`` whose counts are the numbers of names collected and
            whose ``yes_votes`` / ``no_votes`` / ``other_votes`` hold those
            names.

        Raises:
            KeyError: If ``motion`` is not in ``self._vote_mapping``.
        """
        vote_pdf, resp = self.urlretrieve(url)
        text = convert_pdf(vote_pdf, 'text')
        os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []

        # point at array to add names to
        cur_array = None

        precursors = (
            ('Yeas--', yes_votes),
            ('Nays--', no_votes),
            ('Absent or those not voting--', other_votes),
            ('Absent and those not voting--', other_votes),
            ('Not Voting--', other_votes),
            ('Voting Present--', other_votes),
            ('Present--', other_votes),
            ('DISCLAIMER', None),
        )

        # Fragments showing that a comma-separated chunk is not a member name
        junk_markers = ('on final passage', 'Necessary', 'who would have',
                        'being a tie', 'therefore', 'Vacancies', 'a pair',
                        'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD',
                        'TREASURER', 'GOVERNOR', 'ARCHIVES', 'SECRETARY')

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.split('\n'))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line:
                    cur_array = arr
                    line = line.replace(pc, '')

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None
                match = re.match(r'(.+?)\. Total--.*', name)
                if match:
                    # The text before ". Total--" is the last name of the
                    # section. There may be no active section (after "None."
                    # or DISCLAIMER), in which case there is nowhere to
                    # store it.
                    if cur_array is not None:
                        cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = any(junk in name for junk in junk_markers)
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name.endswith('.'):
                        name = name[:-1]
                    cur_array.append(name)

        # return vote object
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        other_count = len(other_votes)
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote['yes_votes'] = yes_votes
        vote['no_votes'] = no_votes
        vote['other_votes'] = other_votes
        return vote