Esempio n. 1
0
    def _fix_house_text(self, filename):
        """
        Build usable text for the 2015 House committee list.

        Plain `pdftotext` output scrambles the second and third columns of
        the second page of that document, while the `--layout` option keeps
        them apart. Rather than moving every parse over to `--layout`, the
        garbled tail of the normal output is cut off and the one column that
        is actually needed is lifted out of the alternate output and
        appended.

        Only the second column is recovered here: the third and fourth
        columns list joint committees, which are scraped from the Senate
        document instead.
        """
        column_header = "State Administration (cont.)      "

        # Normal parser settings: keep everything before the scrambled part.
        plain_text = convert_pdf(filename, type="text-nolayout")
        assert "Revised: January 23, 2015" in plain_text, (
            "House committee list has changed; check that the special-case"
            " fix is still necessary, and that the result is still correct"
        )
        kept_text = re.sub(r"(?sm)Appropriations/F&C.*$", "", plain_text)

        # Alternate parser settings: exactly one line must hold the header,
        # otherwise the single-element unpacking below fails.
        layout_rows = convert_pdf(filename, type="text").split("\n")
        (header_row,) = [row for row in layout_rows if column_header in row]
        header_position = layout_rows.index(header_row)

        # The column is as wide as the header and starts one character
        # before it.
        left_edge = header_row.index(column_header) - 1
        right_edge = left_edge + len(column_header)

        recovered_column = "\n".join(
            row[left_edge:right_edge]
            for row in layout_rows[header_position + 1 :]
        )
        return kept_text + recovered_column
Esempio n. 2
0
 def scrape_vote_text(self, filelocation, local=False):
     """Convert a vote PDF into XML and delete the PDF afterwards.

     :param filelocation: URL of the PDF, or a path on disk when ``local``
         is true.
     :param local: when true, ``filelocation`` is converted directly
         instead of being downloaded first.
     :returns: the result of ``convert_pdf(..., type='xml')``, or ``None``
         when the download raised ``scrapelib.HTTPError``.
     """
     if local:
         # Local copies keep the original behaviour: removed only after a
         # successful conversion.
         vote_text = convert_pdf(filelocation, type='xml')
         os.remove(filelocation)
         return vote_text

     # Only the download itself is expected to raise HTTPError.
     try:
         filename, _response = self.urlretrieve(url=filelocation)
     except scrapelib.HTTPError:
         self.warning('Request failed: {}'.format(filelocation))
         return None

     try:
         return convert_pdf(filename, type='xml')
     finally:
         # Remove the downloaded file even if the conversion blew up.
         os.remove(filename)
Esempio n. 3
0
 def scrape_vote_text(self, filelocation, local=False):
     """Return the XML conversion of a vote PDF, deleting the PDF when done.

     :param filelocation: URL to download, or a local file path when
         ``local`` is true.
     :param local: skip the download and convert ``filelocation`` as-is.
     :returns: whatever ``convert_pdf(..., type='xml')`` produces, or
         ``None`` if downloading raised ``scrapelib.HTTPError``.
     """
     if not local:
         # Keep the try narrow: only urlretrieve should raise HTTPError.
         try:
             filename, _response = self.urlretrieve(url=filelocation)
         except scrapelib.HTTPError:
             self.warning('Request failed: {}'.format(filelocation))
             return None

         try:
             return convert_pdf(filename, type='xml')
         finally:
             # The download must not outlive this call, even on failure.
             os.remove(filename)

     # Local copy: converted first, removed only once conversion succeeded.
     vote_text = convert_pdf(filelocation, type='xml')
     os.remove(filelocation)
     return vote_text
Esempio n. 4
0
    def _fix_house_text(self, filename):
        '''
        Return committee-list text with the unusable part swapped out.

        `pdftotext` mixes up the second and third columns on page two of
        the 2015 House committee list. The `--layout` option avoids that,
        but the standard `pdftotext --nolayout` output is easier to parse
        everywhere else, so only the broken part is replaced: the bad text
        is thrown away and the second column is copied in from the
        alternate conversion.

        The third and fourth columns are joint committees that are scraped
        from the Senate document, so they are not re-inserted here.
        '''

        # Regular conversion; everything from the mixed-up section onward
        # is discarded.
        regular = convert_pdf(filename, type='text-nolayout')
        assert "Revised: January 23, 2015" in regular,\
            "House committee list has changed; check that the special-case"\
            " fix is still necessary, and that the result is still correct"
        regular = re.sub(r'(?sm)Appropriations/F&C.*$', "", regular)

        # Alternate conversion, line by line.
        alt_lines = convert_pdf(filename, type='text').split('\n')

        wanted_header = "State Administration (cont.)      "
        hits = []
        for candidate in alt_lines:
            if wanted_header in candidate:
                hits.append(candidate)
        # Exactly one line may carry the header; anything else fails here.
        (header_line, ) = hits

        line_no = alt_lines.index(header_line)
        # Column boundaries: one character before the header, header-wide.
        col_from = alt_lines[line_no].index(wanted_header) - 1
        col_to = col_from + len(wanted_header)

        pieces = []
        for below in alt_lines[line_no + 1:]:
            pieces.append(below[col_from:col_to])

        return regular + '\n'.join(pieces)
Esempio n. 5
0
    def __init__(self, url, resp, bill):
        """Convert a PDF payload to text and keep it on the instance.

        :param url: stored as ``self.url``.
        :param resp: bytes-like PDF content; written to a temporary file in
            binary mode so ``convert_pdf`` can read it.
        :param bill: stored as ``self.bill``.
        :raises PDFCommitteeVoteParseError: if the conversion fails or the
            converted text is blank.
        """
        self.url = url
        self.bill = bill

        # Put the document into a temp file for the converter.
        fd, filename = tempfile.mkstemp()
        try:
            # Reuse the descriptor mkstemp opened rather than opening the
            # file a second time; the ``with`` block closes it.
            with os.fdopen(fd, 'wb') as f:
                f.write(resp)

            # Convert it to text.
            try:
                text = convert_pdf(filename, type='text')
            except Exception as exc:
                msg = "couldn't convert pdf."
                raise PDFCommitteeVoteParseError(msg) from exc
        finally:
            # Get rid of the temp file, including when conversion failed.
            os.remove(filename)

        if not text.strip():
            msg = 'PDF file was empty.'
            raise PDFCommitteeVoteParseError(msg)

        # ``text`` is decoded line by line, dropping empty lines.
        self.text = '\n'.join(
            [line.decode() for line in text.splitlines() if line])
Esempio n. 6
0
    def __init__(self, url, resp, bill):
        """Store the vote's URL and bill, plus the text of its PDF.

        :param url: kept as ``self.url``.
        :param resp: bytes-like content of the PDF; it is written to a
            temporary file that ``convert_pdf`` then reads.
        :param bill: kept as ``self.bill``.
        :raises PDFCommitteeVoteParseError: when ``convert_pdf`` raises, or
            when the converted text contains only whitespace.
        """
        self.url = url
        self.bill = bill

        fd, filename = tempfile.mkstemp()
        try:
            # Write through the descriptor returned by mkstemp so it is
            # closed exactly once, by the context manager.
            with os.fdopen(fd, 'wb') as tmp:
                tmp.write(resp)

            try:
                text = convert_pdf(filename, type='text')
            except Exception as exc:
                # Keep the original error attached as the cause.
                raise PDFCommitteeVoteParseError(
                    "couldn't convert pdf.") from exc
        finally:
            # The temp file is removed on success and on failure alike.
            os.remove(filename)

        if not text.strip():
            raise PDFCommitteeVoteParseError('PDF file was empty.')

        # Decode each non-empty line and re-join with newlines.
        decoded_lines = [line.decode() for line in text.splitlines() if line]
        self.text = '\n'.join(decoded_lines)
Esempio n. 7
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Yield committee ``Organization`` objects parsed from a PDF.

        The PDF is converted to text, a few committee names that come out
        with irregular whitespace are normalized, and the lines from the
        first one containing ``'Agriculture'`` onward are read in order:
        a committee-name line starts a new committee, a legislator line
        adds a member to the current one. Parsing stops at the first line
        containing ``'Subcommittees'``.

        :param year: the string ``'2015'`` together with ``chamber ==
            'lower'`` selects the special-case text from
            ``_fix_house_text``.
        :param chamber: passed to ``Organization``.
        :param filename: path of the PDF to convert.
        :param url: added as the source of every committee.
        :returns: generator of committees that have related objects
            (``comm._related`` is non-empty).
        """
        if chamber == 'lower' and year == '2015':
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type='text-nolayout').decode()

        # Committee names whose internal whitespace gets mangled by the
        # conversion are collapsed back to their single-line form.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
                'Judicial Branch, Law Enforcement, and Justice'),

            (r'Natural Resources and\s+Transportation',
                'Natural Resources and Transportation'),

            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
                'Federal Relations, Energy, and Telecommunications')
                ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents.
            # NOTE(review): the space variant is written as an explicit
            # escape so it is visible in source; it is assumed to be a
            # no-break space (U+00A0) - confirm against real PDF output.
            line = line.replace("\u00a0", " ").replace("‐", "-")

            if 'Subcommittees' in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # A new committee begins; emit the previous one if it has
                # anything attached.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee, chamber=chamber,
                                    classification='committee')

                comm.add_source(url)

            elif is_legislator_name(line):
                # Everything after the last "(" carries the role marker.
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if ' Ch' in party:
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # ``comm`` is still None when no committee line was ever seen (for
        # example when no line contains 'Agriculture'); guard against that
        # instead of failing on ``None._related``.
        if comm and comm._related:
            yield comm
Esempio n. 8
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse a committee-list PDF and yield its committees.

        After conversion to text, lines are consumed starting at the first
        one that contains ``'Agriculture'``. A line recognized by
        ``is_committee_name`` opens a new ``Organization``; a line
        recognized by ``is_legislator_name`` is added to the committee
        currently open. The first line containing ``'Subcommittees'`` ends
        parsing.

        :param year: ``'2015'`` combined with ``chamber == 'lower'``
            switches to the text produced by ``_fix_house_text``.
        :param chamber: forwarded to ``Organization``.
        :param filename: PDF file to convert.
        :param url: recorded as each committee's source.
        :returns: generator over committees whose ``_related`` is
            non-empty.
        """

        if chamber == 'lower' and year == '2015':
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type='text-nolayout').decode()

        # Normalize committee names that the conversion splits or pads.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
             'Judicial Branch, Law Enforcement, and Justice'),
            (r'Natural Resources and\s+Transportation',
             'Natural Resources and Transportation'),
            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
             'Federal Relations, Energy, and Telecommunications')):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents.
            # NOTE(review): the space variant is spelled as an escape so it
            # cannot be mistaken for a plain space; it is assumed to be a
            # no-break space (U+00A0) - confirm against real PDF output.
            line = line.replace("\u00a0", " ").replace("‐", "-")

            if 'Subcommittees' in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # Flush the committee collected so far before starting the
                # next one.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee,
                                    chamber=chamber,
                                    classification='committee')

                comm.add_source(url)

            elif is_legislator_name(line):
                # The role marker sits in the trailing parenthesis.
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if ' Ch' in party:
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # No committee line may have been seen at all, in which case
        # ``comm`` is still None and must not be dereferenced.
        if comm and comm._related:
            yield comm
Esempio n. 9
0
    def scrape_rollcall(self, vote, vurl):
        """
        Read the roll call from a vote PDF and record each voter on ``vote``.

        Lines starting with a section keyword (YEAS/AYES, NAYS, EXCUSED,
        NOT VOTING, ABSTAIN, PAIRED) switch the section being recorded;
        every other non-empty line is split into names on runs of three or
        more whitespace characters.

        :param vote:  related voteEvent object; its ``yes``, ``no`` and
                      ``vote`` methods are called once per name
        :param vurl:  pdf source url
        """
        (path, resp) = self.urlretrieve(vurl)
        try:
            pdflines = convert_pdf(path, 'text')
        finally:
            # Remove the download even when the conversion fails.
            os.remove(path)

        current_vfunc = None
        option = None

        for line in pdflines.split(b'\n'):
            line = line.strip().decode()

            # change what is being recorded
            if line.startswith(('YEAS', 'AYES')):
                current_vfunc = vote.yes
                # vote.yes/vote.no are called with the name only, so any
                # option left over from an earlier section must be cleared.
                option = None
            elif line.startswith('NAYS'):
                current_vfunc = vote.no
                option = None
            elif line.startswith(('EXCUSED', 'NOT VOTING', 'ABSTAIN')):
                # All three sections are recorded under 'excused'.
                current_vfunc = vote.vote
                option = 'excused'
            elif line.startswith('PAIRED'):
                current_vfunc = vote.vote
                option = 'paired'

            # skip these
            elif not line or line.startswith('Page '):
                continue

            # if a vfunc is active
            elif current_vfunc:
                # split names apart by 3 or more spaces
                for name in re.split(r'\s{3,}', line):
                    if not name:
                        continue
                    if option:
                        current_vfunc(option=option, voter=name.strip())
                    else:
                        current_vfunc(name.strip())
Esempio n. 10
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Download a roll-call PDF and yield one ``VoteEvent`` built from it.

        The converted text is scanned for section markers such as
        ``yeas--`` and ``nays--``; the comma-separated names that follow a
        marker are collected under that section. Counts are the number of
        names collected per section, not figures read from the document.

        :param url: location of the PDF; also recorded as the vote's source
            and used in its ``pupa_id``.
        :param motion: key into ``self._vote_mapping``, which supplies the
            motion text and whether it passed; an unknown key raises
            ``KeyError``.
        :param date: passed through ``self._tz.localize`` for the start date.
        :param chamber: forwarded to ``VoteEvent``.
        :param bill: forwarded to ``VoteEvent``; ``bill.identifier`` is used
            in the ``pupa_id``.
        """
        vote_pdf, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(vote_pdf, 'text')
        finally:
            # Remove the download even when the conversion fails.
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # Order matters: longer markers come before the shorter markers
        # they contain ("absent or those not voting--" / "not voting--").
        precursors = (
            ('yeas--', yes_votes),
            ('nays--', no_votes),
            ('absent or those not voting--', absent_votes),
            ('absent and those not voting--', absent_votes),
            ('not voting--', not_voting_votes),
            ('voting present--', other_votes),
            ('present--', other_votes),
            ('disclaimer', None),
        )

        # Substrings marking a fragment as boilerplate rather than a name.
        junk_markers = ('on final passage', 'Necessary', 'who would have',
                        'being a tie', 'therefore', 'Vacancies', 'a pair',
                        'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD',
                        'TREASURER', 'GOVERNOR', 'ARCHIVES', 'SECRETARY')

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split('\n'))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    # The marker was found case-insensitively, so it has to
                    # be removed case-insensitively as well; otherwise a
                    # capitalized marker stays in the line and can be
                    # re-matched by a shorter marker further down.
                    line = re.sub(re.escape(pc), '', line,
                                  flags=re.IGNORECASE)

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None

                match = re.match(r'(.+?)\. Total--.*', name)
                if match:
                    # A total may show up while no section is active.
                    if cur_array is not None:
                        cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = any(junk in name for junk in junk_markers)
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == '.':
                        name = name[:-1]
                    cur_array.append(name)

        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(date),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passage',
                         bill=bill)
        vote.pupa_id = url + '#' + bill.identifier

        tallies = (
            ('yes', yes_votes),
            ('no', no_votes),
            ('absent', absent_votes),
            ('not voting', not_voting_votes),
            ('other', other_votes),
        )
        for vote_option, voters in tallies:
            vote.set_count(vote_option, len(voters))
        vote.add_source(url)
        for vote_option, voters in tallies:
            for voter in voters:
                vote.vote(vote_option, voter)
        yield vote
Esempio n. 11
0
    def parse_vote_pdf(self, vote_url, bill):
        """Download a vote PDF and return the ``VoteEvent`` it describes.

        :param vote_url: URL of the PDF. ``'Senate'`` appearing in the URL
            selects the upper chamber, anything else the lower chamber. The
            URL is also used as the vote's ``pupa_id``.
        :param bill: forwarded to ``VoteEvent``.
        :returns: the populated ``VoteEvent``.
        :raises ValueError: if the tally line or a recognizable motion
            cannot be found, or if the number of names extracted per
            category differs from the tallies.
        """
        # Local import so this method does not rely on the module's imports.
        import os

        filename, response = self.urlretrieve(vote_url)
        try:
            text = convert_pdf(filename, type='text').decode()
        finally:
            # The download is only needed for the conversion; remove it
            # instead of leaving it behind.
            os.remove(filename)
        lines = text.splitlines()

        chamber = 'upper' if 'Senate' in vote_url else 'lower'

        # The first line carries the date after a "Calendar Date:" label.
        date_string = lines[0].split('Calendar Date:')[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # Locate the tally line, which mentions both Yeas and Nays.
        page_index = None
        for index, line in enumerate(lines):
            if 'Yeas' in line and 'Nays' in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

        # A match on the very first line (index 0) is rejected as well: the
        # motion is read from the lines preceding the tally.
        if not page_index:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        # Each tally cell looks like "<number> <label>"; cells are taken
        # positionally in the order of ``vote_types``.
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            number, _label = count.split(' ', 1)
            vote_counts[index] = int(number)

        passed = vote_counts[0] > vote_counts[1]

        motion_keywords = [
            'favorable', 'reading', 'amendment', 'motion', 'bill be introduced'
        ]

        def looks_like_motion(candidate):
            lowered = candidate.lower()
            return any(keyword in lowered for keyword in motion_keywords)

        # The motion is expected three lines above the tally, otherwise two.
        motion = re.split(r'\s{2,}', lines[page_index - 3].strip())[0]
        if not looks_like_motion(motion):
            motion = re.split(r'\s{2,}', lines[page_index - 2].strip())[0]
        if not looks_like_motion(motion):
            self.error("Motion Extracted: %s" % motion)
            raise ValueError("No Motion or faulty Motion scraped.")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        vote.pupa_id = vote_url  # contains sequence number

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            'Voting Nay', 'Not Voting', 'COPY', 'Excused',
            'indicates vote change'
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()
            page_index += 1

            if not current_line or 'Voting Yea' in current_line:
                continue

            # Each show stopper moves on to the next vote category.
            if any(show_stopper in current_line
                   for show_stopper in show_stoppers):
                vote_index = (vote_index + 1)
                continue

            names = re.split(r'\s{2,}', current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Esempio n. 12
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Yield a single ``VoteEvent`` parsed from the roll-call PDF at ``url``.

        Section markers such as ``yeas--`` and ``nays--`` are looked for in
        the converted text; the comma-separated names after a marker are
        gathered under that section. The counts set on the vote are the
        number of names gathered, not numbers read from the document.

        :param url: PDF location; also the vote's source and part of its
            ``pupa_id``.
        :param motion: key into ``self._vote_mapping`` giving the motion
            text and pass/fail flag; an unknown key raises ``KeyError``.
        :param date: passed through ``self._tz.localize`` for the start date.
        :param chamber: forwarded to ``VoteEvent``.
        :param bill: forwarded to ``VoteEvent``; ``bill.identifier`` is part
            of the ``pupa_id``.
        """
        vote_pdf, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(vote_pdf, "text")
        finally:
            # The download is deleted whether or not conversion succeeded.
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # Longer markers are listed before the shorter ones they contain
        # ("absent or those not voting--" before "not voting--").
        precursors = (
            ("yeas--", yes_votes),
            ("nays--", no_votes),
            ("absent or those not voting--", absent_votes),
            ("absent and those not voting--", absent_votes),
            ("not voting--", not_voting_votes),
            ("voting present--", other_votes),
            ("present--", other_votes),
            ("disclaimer", None),
        )

        # Fragments containing any of these are boilerplate, not names.
        junk_markers = (
            "on final passage",
            "Necessary",
            "who would have",
            "being a tie",
            "therefore",
            "Vacancies",
            "a pair",
            "Total-",
            "ATTORNEY",
            "SPEAKER",
            "BOARD",
            "TREASURER",
            "GOVERNOR",
            "ARCHIVES",
            "SECRETARY",
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split("\n"))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    # Detection ignores case, so removal must too;
                    # otherwise a capitalized marker survives in the line
                    # and a shorter marker below can match inside it.
                    line = re.sub(
                        re.escape(pc), "", line, flags=re.IGNORECASE
                    )

            # split names
            for name in line.split(","):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if "None." in name:
                    cur_array = None

                match = re.match(r"(.+?)\. Total--.*", name)
                if match:
                    # Guard against a total appearing with no open section.
                    if cur_array is not None:
                        cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = any(junk in name for junk in junk_markers)
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == ".":
                        name = name[:-1]
                    cur_array.append(name)

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.pupa_id = url + "#" + bill.identifier

        tallies = (
            ("yes", yes_votes),
            ("no", no_votes),
            ("absent", absent_votes),
            ("not voting", not_voting_votes),
            ("other", other_votes),
        )
        for vote_option, voters in tallies:
            vote.set_count(vote_option, len(voters))
        vote.add_source(url)
        for vote_option, voters in tallies:
            for voter in voters:
                vote.vote(vote_option, voter)
        yield vote
Esempio n. 13
0
    def scrape_journal(self, url, chamber, session, date):
        """Yield a ``VoteEvent`` for each bill vote found in a journal PDF.

        The journal is converted to text, page headers and empty lines are
        dropped, and every passage starting with "On the question" that
        also contains "shall" is treated as the start of a vote. The
        passage is extended line by line until the end of the motion is
        reached; the bill number, motion text and the vote tallies
        (via ``self.parse_votes``, which reads from the same line iterator)
        are then extracted.

        :param url: location of the journal PDF; recorded as vote source.
        :param chamber: ``"upper"`` or ``"lower"``; selects how the end of a
            motion is recognized.
        :param session: used as ``legislative_session`` of each vote.
        :param date: used as ``start_date`` of each vote.
        :raises ValueError: if a vote is reported as passed without more
            yes than no votes.
        """

        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r', filename)
        all_text = convert_pdf(filename, type="text")

        lines = all_text.split(b'\n')
        lines = [line.decode('utf-8') for line in lines]
        # Normalize typographic dashes and quote marks to ASCII.
        lines = [line.
                 strip().
                 replace('–', '-').
                 replace('―', '"').
                 replace('‖', '"').
                 replace('“', '"').
                 replace('”', '"')
                 for line in lines]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter([line for line in lines if not(
                     line == "" or
                     re.match(header_date_re, line) or
                     re.match(header_journal_re, line))])

        for line in lines:
            # Only lines that start a question AND mention "shall" begin a
            # vote; everything else is skipped.
            if not line.startswith("On the question") or \
                    "shall" not in line.lower():
                continue

            # Get the bill_id
            bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

            # The Senate ends its motion text with a vote announcement
            if chamber == "upper":
                end_of_motion_re = r'.* the vote was:\s*'
            # The House may or may not end motion text with a bill name
            elif chamber == "lower":
                end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

            # Keep appending lines until the motion is complete. Running
            # out of lines here must not let StopIteration escape: inside a
            # generator that would surface as RuntimeError (PEP 479).
            while not re.match(end_of_motion_re, line, re.IGNORECASE):
                try:
                    line += " " + next(lines)
                except StopIteration:
                    self.warning(
                        "Journal ended in the middle of a motion: {}".
                        format(line))
                    return

            bill_match = re.search(bill_re, line)
            if bill_match is None:
                self.warning("This motion did not pertain to legislation: {}".
                             format(line))
                continue
            bill_id = bill_match.group(1)

            # Get the motion text
            motion_re = r'''
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*"\s+  # Motion is followed by a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    '''.format(
                    bill_re,
                    r',?.*?the\svote\swas:' if chamber == 'upper' else ''
                    )
            motion = re.search(motion_re,
                               line,
                               re.VERBOSE | re.IGNORECASE).group(1)

            # Abbreviate the bill identifier, e.g. "Senate"/"House"/"File"
            # become "S"/"H"/"F", and drop periods.
            for word, letter in (('Senate', 'S'),
                                 ('House', 'H'),
                                 ('File', 'F')):
                bill_id = bill_id.replace(word, letter)

            bill_id = bill_id.replace('.', '')

            # The first letter of the identifier names the bill's chamber.
            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            self.current_id = bill_id
            votes, passed = self.parse_votes(lines)

            # at the very least, there should be a majority
            # for the bill to have passed, so check that,
            # but if the bill didn't pass, it could still be OK if it got a majority
            # eg constitutional amendments
            if not ((passed == (votes['yes_count'] > votes['no_count'])) or (not passed)):
                self.error("The bill passed without a majority?")
                raise ValueError('invalid vote')

            # also throw a warning if the bill failed but got a majority
            # it could be OK, but is probably something we'd want to check
            if not passed and votes['yes_count'] > votes['no_count']:
                self.logger.warning("The bill got a majority but did not pass. "
                                    "Could be worth confirming.")

            result = "pass" if passed else "fail"

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=re.sub('\xad', '-', motion),
                             result=result,
                             classification='passage',
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=bill_chamber
                             )

            # add votes and counts
            for vtype in ('yes', 'no', 'absent', 'abstain'):
                vcount = votes['{}_count'.format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes['{}_votes'.format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
Esempio n. 14
0
    def scrape_journal(self, session, url):
        """Yield the votes recorded in one legislative journal PDF.

        The PDF at ``url`` is converted to text and walked line by line with
        a small state machine.  Each completed question opens a ``VoteEvent``
        unless its ``(bill_id, date, question)`` triple is already present in
        ``self._seen``; the event is validated and yielded once a line
        announcing the outcome is reached.

        :param session: passed to ``VoteEvent`` as ``legislative_session``
        :param url: journal PDF to download; also added as the vote source
        :raises Exception: if a question ends before any bill id was seen
        """
        journal, resp = self.urlretrieve(url)
        text = convert_pdf(journal, type='text').decode()
        lines = text.splitlines()

        # special cases for long name w/ 1 space: these names are followed by
        # a single space, so the generic 3+ whitespace split does not
        # separate them from the next voter.
        single_space_names = ('Lautenbaugh ', 'Langemeier ', 'McCollister ',
                              'Pansing Brooks ', 'Schumacher ')

        #  state machine:
        #      None - undefined state
        #      question_quote - in question, looking for end quote
        #      pre-yes - vote is active, haven't hit yes votes yet
        #      yes     - yes votes
        #      no      - no votes
        #      other   - other votes
        state = None
        vote = None
        question = None
        date = None
        # Must be bound before the first question is finished; otherwise the
        # "cannot save vote without bill_id" check below would fail with an
        # UnboundLocalError instead of the intended exception.
        bill_id = None
        other_count = 0

        for line in lines:
            date_match = DATE_RE.findall(line)

            # skip headers
            if 'LEGISLATIVE JOURNAL' in line:
                continue

            elif date_match:
                date = datetime.datetime.strptime(' '.join(date_match[0]),
                                                  '%B %d %Y')
                continue

            # keep adding lines to question while quotes are open
            elif state == 'question_quote':
                question += ' %s' % line

            elif state in ('pre-yes', 'yes', 'no', 'other'):
                yes_match = YES_RE.match(line)
                no_match = NO_RE.match(line)
                other_match = NOT_VOTING_RE.match(line)
                if yes_match:
                    vote.set_count('yes', int(yes_match.group(1)))
                    state = 'yes'
                elif no_match:
                    vote.set_count('no', int(no_match.group(1)))
                    state = 'no'
                elif other_match:
                    # several "other" headings are summed into one count
                    other_count += int(other_match.group(1))
                    state = 'other'
                elif ('having voted in the affirmative' in line
                      or 'Having failed' in line):
                    # outcome line: close the vote and hand it to the caller
                    vote.set_count('other', other_count)
                    if 'having voted in the affirmative' in line:
                        vote.result = 'pass'
                    else:
                        vote.result = 'fail'
                    state = None
                    vote.validate()
                    yield vote
                    vote = None
                    other_count = 0
                elif line:
                    # NOTE(review): a non-empty line reached while still in
                    # 'pre-yes' is recorded with option 'pre-yes' -- confirm
                    # whether that can happen in real journals.
                    people = re.split(r'\s{3,}', line)
                    for p in people:
                        if not p:
                            continue
                        prefix = next(
                            (n for n in single_space_names if p.startswith(n)),
                            None)
                        if prefix is None:
                            vote.vote(state, p)
                        else:
                            # Cut after the whole known name rather than at
                            # the first space, so that the two-word surname
                            # 'Pansing Brooks' stays in one piece.
                            vote.vote(state, prefix[:-1])
                            vote.vote(state, p[len(prefix):])

            # check the text against our regexes
            bill_match = BILL_RE.match(line)
            veto_match = VETO_BILL_RE.findall(line)
            question_match = QUESTION_RE.findall(line)
            if bill_match:
                bill_type, bill_id = bill_match.groups()
                if bill_type == 'BILL':
                    bill_id = 'LB ' + bill_id
                elif bill_type == 'RESOLUTION':
                    bill_id = 'LR ' + bill_id
            elif question_match:
                question = question_match[0]
                state = 'question_quote'
            elif veto_match:
                bill_id = veto_match[0]

            # line just finished a question
            if state == 'question_quote' and QUESTION_MATCH_END in question:
                question = re.sub(
                    r'\s+', ' ',
                    question.replace(QUESTION_MATCH_END, '').strip())

                if not bill_id:
                    raise Exception('cannot save vote without bill_id')

                # skip votes that were already produced from another journal
                vtuple = (bill_id, date, question)
                if vtuple in self._seen:
                    vote = None
                    continue
                else:
                    self._seen.add(vtuple)

                # result is a placeholder until the outcome line is parsed
                vote = VoteEvent(
                    bill=bill_id,
                    bill_chamber='legislature',
                    chamber='legislature',
                    legislative_session=session,
                    start_date=date.strftime('%Y-%m-%d'),
                    motion_text=question,
                    classification='passage',
                    result='fail',
                )
                vote.add_source(url)
                state = 'pre-yes'
                # reset bill_id and question
                bill_id = question = None
Esempio n. 15
0
    def parse_vote_pdf(self, vote_url, bill):
        """Build a ``VoteEvent`` from the roll-call PDF at ``vote_url``.

        The PDF is converted to text; the date comes from the first line, the
        totals from the first line that mentions both 'Yeas' and 'Nays', the
        motion from a few lines above the totals, and the voter names from
        the lines below them.

        :param vote_url: PDF location; 'Senate' in the URL selects the upper
            chamber
        :param bill: bill the vote belongs to; its ``identifier`` is used in
            the vote's ``pupa_id``
        :returns: the populated ``VoteEvent``
        :raises ValueError: if the totals line is missing, or if the number
            of names per category differs from the printed totals
        """
        filename, _response = self.urlretrieve(vote_url)
        lines = convert_pdf(filename, type='text').decode().splitlines()

        chamber = 'upper' if 'Senate' in vote_url else 'lower'

        date_string = lines[0].split('Calendar Date:')[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # First line that carries the tallies, or None when there is none.
        page_index = next(
            (idx for idx, text_line in enumerate(lines)
             if 'Yeas' in text_line and 'Nays' in text_line),
            None)
        # Plain truthiness on purpose: index 0 is handled like "not found".
        if not page_index:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']
        vote_counts = [0] * 5
        tally_cells = re.split(r'\s{2,}', lines[page_index].strip())
        for slot, cell in enumerate(tally_cells):
            # each cell looks like "<number> <label>"
            number, _label = cell.split(' ', 1)
            vote_counts[slot] = int(number)

        passed = vote_counts[0] > vote_counts[1]

        motion_keywords = [
            'favorable', 'reading', 'amendment', 'motion', 'introduced',
            'bill pass', 'committee'
        ]

        def names_a_motion(candidate):
            """Return True when ``candidate`` contains any motion keyword."""
            lowered = candidate.lower()
            return any(keyword in lowered for keyword in motion_keywords)

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        consent_calendar_bills = None
        motion = ""
        if any('Consent Calendar' in above for above in lines[:page_index]):
            motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r'\s{2,}',
                                              lines[page_index - 1].strip())
            assert consent_calendar_bills, "Could not find bills for consent calendar vote"

        # Offsets above the totals line, tried in this order, until the text
        # found there looks like a motion.
        for offset in (3, 2, 4, 5):
            if names_a_motion(motion):
                break
            motion = re.split(r'\s{2,}', lines[page_index - offset].strip())[0]
        else:
            if not names_a_motion(motion):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not names_a_motion(motion):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

        for vote_type, total in zip(vote_types, vote_counts):
            vote.set_count(vote_type, total)

        # Keywords for identifying where names are located in the pdf; each
        # one met moves on to the next entry of vote_types.
        show_stoppers = [
            'Voting Nay', 'Not Voting', 'COPY', 'Excused',
            'indicates vote change'
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = [0] * 5

        # Names start two lines below the totals line.
        for raw_line in lines[page_index + 2:]:
            current_line = raw_line.strip()

            if not current_line or 'Voting Yea' in current_line:
                continue

            if any(marker in current_line for marker in show_stoppers):
                vote_index += 1
                continue

            names = re.split(r'\s{2,}', current_line)
            vote_name_counts[vote_index] += len(names)
            for name in names:
                vote.vote(vote_types[vote_index], name)

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Esempio n. 16
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
        """Yield a ``VoteEvent`` for every roll-call PDF listed in ``rollcalls``.

        Each PDF is fetched through the proxy, converted to text and parsed
        by fixed line positions: chamber (line 0), date (lines 1-2), result
        keyword (lines 3-4), motion and totals (lines 4-7) and voter names
        (line 8 onwards).

        :param rollcalls: iterable of mappings with a ``"link"`` entry
        :param bill_id: passed to ``VoteEvent`` as ``bill``
        :param original_chamber: passed to ``VoteEvent`` as ``bill_chamber``
        :param session: passed to ``VoteEvent`` as ``legislative_session``
        :param proxy: mapping whose ``"url"`` is prefixed to every link
        :raises AssertionError: when no known result keyword is found
        """
        result_types = {
            "FAILED": False,
            "DEFEATED": False,
            "PREVAILED": True,
            "PASSED": True,
            "SUSTAINED": True,
            "NOT SECONDED": False,
            "OVERRIDDEN": True,
            "ADOPTED": True,
        }
        # Section heading (lower-cased, spaces removed) -> vote option,
        # checked in this order.
        section_markers = (
            ("yea-", "yes"),
            ("nay-", "no"),
            ("excused-", "excused"),
            ("notvoting-", "not voting"),
        )

        for rollcall in rollcalls:
            proxy_link = proxy["url"] + rollcall["link"]

            try:
                (path, resp) = self.urlretrieve(proxy_link)
            except scrapelib.HTTPError as e:
                self.warning(e)
                self.warning(
                    "Unable to contact openstates proxy, skipping vote {}".format(
                        rollcall["link"]
                    )
                )
                continue

            lines = convert_pdf(path, "text").decode("utf-8").split("\n")
            os.remove(path)

            if "house of representatives" in lines[0].lower():
                chamber = "lower"
            else:
                chamber = "upper"

            # last three words of line 1 are the date, line 2 is the time
            day_part = " ".join(lines[1].strip().split()[-3:]).title()
            date_str = day_part + " " + lines[2].strip()

            parsed = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
            localized = pytz.timezone("America/Indiana/Indianapolis").localize(
                parsed
            )
            vote_date = localized.isoformat()

            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            result_lines = [candidate.upper() for candidate in lines[3:5]]
            passed = None
            for keyword, outcome in result_types.items():
                # no early exit: a later keyword that also matches wins
                if any(keyword in candidate for candidate in result_lines):
                    passed = outcome

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                # the last token of lines 4-7 holds the four totals
                yeas, nays, excused, not_voting = (
                    int(lines[row].split()[-1]) for row in range(4, 8)
                )
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(
                chamber=chamber,
                legislative_session=session,
                bill=bill_id,
                bill_chamber=original_chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
            )

            for option, total in (
                ("yes", yeas),
                ("no", nays),
                ("excused", excused),
                ("not voting", not_voting),
            ):
                vote.set_count(option, total)
            vote.add_source(proxy_link)

            currently_counting = ""

            for line in lines[8:]:
                # NOTE(review): the text was decoded from UTF-8 above, so a
                # non-breaking space would appear as "\xa0"; these literals
                # ("\xc2" followed by "\xa0") look like Python 2 leftovers --
                # confirm before changing them.
                line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                line = line.replace("\xc2\xa0", " -")

                squashed = line.lower().replace(" ", "")
                heading = next(
                    (option for marker, option in section_markers
                     if marker in squashed),
                    None,
                )
                if heading is not None:
                    currently_counting = heading
                    continue

                # Nothing to record before the first heading; also skip the
                # version number which is often found at the bottom of the doc
                if currently_counting == "" or re.search(r"v\. \d\.\d", line):
                    continue

                for chunk in line.split("  "):
                    voter = chunk.strip()
                    if voter:
                        vote.vote(currently_counting, voter)

            yield vote
Esempio n. 17
0
    def scrape_lower(self):
        """Yield an ``Event`` for each committee meeting in the House calendar PDF.

        The PDF is downloaded and converted to text, which is split first on
        date headings and then on committee-name lines.  Each remaining chunk
        is matched for a time, a location and a description; chunks that do
        not match are skipped.  Bills named in the description are attached
        as agenda items.
        """
        PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text-nolayout').decode()
        os.remove(path)

        # Split on headings that strptime can read as '%A, %B %d, %Y'.
        # The weekday has to be matched with ``\w+day``: the former
        # ``\wF+day`` required a run of the letter "F" before "day", so no
        # weekday name matched and no event was ever produced.
        days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)

        # re.split with one capture group returns
        # [preamble, heading, body, heading, body, ...]
        for date, day_text in zip(days[1::2], days[2::2]):
            # Same layout again: [preamble, committee, details, ...]
            events = re.split(r'\n((?:\w+\s?)+)\n', day_text)
            for comm_text, details in zip(events[1::2], events[2::2]):
                comm = comm_text.strip()

                found = re.search(
                        r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?),\s  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''',
                        details)
                if found is None:
                    continue
                (start_text, location, description) = found.groups()

                # "9:30 a.m." -> "9:30 AM" so that %p can parse it
                start_text = start_text.replace(".", "").upper()
                start = datetime.datetime.strptime(
                        start_text + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                        )
                start = self._tz.localize(start)

                location = location.strip()

                # drop blank lines and lines that begin with a digit
                description = '\n'.join([
                        x.strip() for x in
                        description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                        ])

                if not description:
                    description = '[No description provided by state]'

                event = Event(
                        name=description,
                        start_date=start,
                        location_name=location,
                        description=description
                )
                event.add_source(PDF_URL)
                event.add_participant(comm, type='committee', note='host')
                for line in description.split('\n'):
                    related_bill = re.search(r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                                             line)
                    if related_bill:
                        (bill_name, relation) = related_bill.groups()
                        item = event.add_agenda_item(relation.strip())
                        item.add_bill(bill_name.replace(".", ""))

                yield event
Esempio n. 18
0
    def scrape_lower(self):
        """Yield an ``Event`` for each committee meeting in the House calendar PDF.

        The calendar is downloaded, converted to text and cut into per-day
        sections and then per-committee sections.  A section only becomes an
        event when a meeting time, location and description can be matched
        in it.  Bills mentioned in the description become agenda items.
        """
        PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text-nolayout').decode()
        os.remove(path)

        # The heading pattern used to read ``\wF+day``, which cannot match a
        # weekday name (it needs one or more "F" right before "day"), so the
        # text was never split and nothing was yielded.  ``\w+day`` matches
        # the full weekday that the '%A' directive below expects.
        days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
        date = None
        for position, chunk in enumerate(days[1:]):
            if position % 2 == 0:
                # even slots hold the captured date heading
                date = chunk
                continue

            events = re.split(r'\n((?:\w+\s?)+)\n', chunk)
            comm = ''
            for slot, piece in enumerate(events[1:]):
                if slot % 2 == 0:
                    # even slots hold the captured committee name
                    comm = piece.strip()
                    continue

                try:
                    (when, location, description) = re.search(
                        r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?),\s  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''', piece).groups()
                except AttributeError:
                    # re.search returned None: not a meeting entry
                    continue

                # "1:00 p.m." -> "1:00 PM" so that %p can parse it
                when = when.replace(".", "").upper()
                when = datetime.datetime.strptime(
                    when + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                when = self._tz.localize(when)

                location = location.strip()

                # drop blank lines and lines that begin with a digit
                description = '\n'.join([
                    x.strip() for x in description.split('\n')
                    if x.strip() and not x.strip()[0].isdigit()
                ])

                if not description:
                    description = '[No description provided by state]'

                event = Event(name=description,
                              start_date=when,
                              location_name=location,
                              description=description)
                event.add_source(PDF_URL)
                event.add_participant(comm,
                                      type='committee',
                                      note='host')
                for line in description.split('\n'):
                    related_bill = re.search(
                        r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                        line)
                    if related_bill:
                        (related_bill,
                         relation) = related_bill.groups()
                        relation = relation.strip()
                        related_bill = related_bill.replace(".", "")
                        item = event.add_agenda_item(relation)
                        item.add_bill(related_bill)

                yield event
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Yield committee ``Organization`` objects parsed from a committee PDF.

        The PDF text is scanned line by line from the first line containing
        'Agriculture'.  Committee-name lines start a new organization and
        legislator lines add members to the current one.  After a line
        containing 'Subcommittees', the lower chamber stops, while the upper
        chamber continues and records the following committees as children
        of 'Joint Appropriations/Finance & Claims'.

        :param year: the string '2015' together with ``chamber == 'lower'``
            selects the special-case text from ``_fix_house_text``
        :param chamber: 'lower' or 'upper'
        :param filename: path of the downloaded PDF
        :param url: added as the source of every committee
        """
        if chamber == 'lower' and year == '2015':
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type='text-nolayout').decode()

        # Rejoin committee names whose words end up separated by arbitrary
        # whitespace in the converted text.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
                'Judicial Branch, Law Enforcement, and Justice'),

            (r'Natural Resources and\s+Transportation',
                'Natural Resources and Transportation'),

            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
                'Federal Relations, Energy, and Telecommunications')
                ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        comm = None
        in_senate_subcommittees = False
        for line in lines:
            # Replace Unicode variants with ASCII equivalents.  Written as
            # escapes (no-break space, Unicode hyphen) so the invisible
            # characters cannot be lost when the file is edited or copied.
            line = line.replace("\u00a0", " ").replace("\u2010", "-")

            if 'Subcommittees' in line:
                # These appear in both chambers' lists, so de-dup the scraping
                if chamber == 'lower':
                    break
                elif chamber == 'upper':
                    self.info("Beginning scrape of joint subcommittees")

                in_senate_subcommittees = True
                chamber = 'legislature'
                continue

            if is_committee_name(line):
                # a new committee starts: emit the previous one if it has
                # collected anything
                if comm and comm._related:
                    yield comm

                if in_senate_subcommittees:
                    committee = 'Joint Appropriations/Finance & Claims'
                    subcommittee = line.strip()
                    comm = Organization(name=subcommittee,
                                        parent_id={'name': committee,
                                                   'classification': 'joint'},
                                        classification='committee',
                                        )
                else:
                    committee = line.strip()
                    comm = Organization(name=committee, chamber=chamber,
                                        classification='committee')

                comm.add_source(url)

            elif is_legislator_name(line):
                # everything after the last "(" holds the role markers
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if ' Ch' in party:
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # ``comm`` is still None when no committee line was found at all;
        # guard like the in-loop check instead of raising AttributeError.
        if comm and comm._related:
            yield comm
Esempio n. 20
0
    def scrape_upper(self):
        """Yield an ``Event`` for each committee meeting in the Senate calendar PDF.

        The calendar text is split on date headings (which carry no year, so
        the current year is appended) and then on committee names.  Sections
        without a recognisable time, location and description are skipped.
        Senate bills named in the description become agenda items.
        """
        PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
        meeting_pattern = r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?)\n  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    '''
        bill_pattern = r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$'

        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text').decode()
        os.remove(path)

        # re.split with one capture group returns
        # [preamble, heading, body, heading, body, ...]
        sections = re.split(r'(\w+day, \w+ \d{1,2})', text)
        for heading, body in zip(sections[1::2], sections[2::2]):
            # Calendar is put out for the current week, so use that year
            date = heading + ", " + str(datetime.datetime.now().year)

            # Same layout again: [preamble, committee, details, ...]
            parts = re.split(r'\n\n((?:\w+\s?)+),\s', body)
            for comm_text, details in zip(parts[1::2], parts[2::2]):
                comm = comm_text.strip()

                found = re.search(meeting_pattern, details)
                if found is None:
                    continue
                (clock, location, description) = found.groups()

                start = self._tz.localize(
                    datetime.datetime.strptime(
                        clock + "_" + date, '%I:%M %p_%A, %B %d, %Y'))

                location = location.strip()

                # keep only meaningful description lines
                kept = []
                for raw in description.split('\n'):
                    entry = raw.strip()
                    if not entry:
                        continue
                    if entry.startswith(("Page ", "*Possible Vote")):
                        continue
                    if entry == "NO OTHER COMMITTEES WILL MEET":
                        continue
                    kept.append(entry)
                description = '\n'.join(kept)

                if not description:
                    description = '[No description provided by state]'

                meeting = Event(name=description,
                                start_date=start,
                                location_name=location,
                                description=description)

                meeting.add_source(PDF_URL)
                meeting.add_participant(comm,
                                        type='committee',
                                        note='host')
                for line in description.split('\n'):
                    bill_found = re.search(bill_pattern, line)
                    if not bill_found:
                        continue
                    (bill_name, relation) = bill_found.groups()
                    item = meeting.add_agenda_item(relation.strip())
                    item.add_bill(bill_name.replace(".", ""))

                yield meeting
Esempio n. 21
0
 def _get_pdf(self, url):
     """Download the PDF at ``url`` and return its text conversion.

     The downloaded file is removed afterwards, including when
     ``convert_pdf`` raises, so a failed conversion does not leave the
     temporary download behind.

     :param url: location of the PDF to fetch
     :returns: the result of ``convert_pdf(path, type="text")``
     """
     (path, response) = self.urlretrieve(url)
     try:
         return convert_pdf(path, type="text")
     finally:
         os.remove(path)
Esempio n. 22
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session,
                       proxy):
        """Yield a ``VoteEvent`` for every roll-call PDF listed in ``rollcalls``.

        Each PDF is fetched through the proxy, converted to text and parsed
        by fixed line positions: chamber (line 0), date (lines 1-2), result
        keyword (lines 3-4), motion and totals (lines 4-7) and voter names
        (line 8 onwards).

        :param rollcalls: iterable of mappings with a ``"link"`` entry
        :param bill_id: passed to ``VoteEvent`` as ``bill``
        :param original_chamber: passed to ``VoteEvent`` as ``bill_chamber``
        :param session: passed to ``VoteEvent`` as ``legislative_session``
        :param proxy: mapping whose ``"url"`` is prefixed to every link
        :raises AssertionError: when no known result keyword is found
        """
        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True,
            'NOT SECONDED': False,
            'OVERRIDDEN': True,
            'ADOPTED': True,
        }
        # Section heading (lower-cased, spaces removed) -> vote option,
        # checked in this order.
        section_markers = (
            ("yea-", "yes"),
            ("nay-", "no"),
            ("excused-", "excused"),
            ("notvoting-", "not voting"),
        )

        for rollcall in rollcalls:
            proxy_link = proxy["url"] + rollcall["link"]
            (path, resp) = self.urlretrieve(proxy_link)
            lines = convert_pdf(path, 'text').decode("utf-8").split("\n")
            os.remove(path)

            if "house of representatives" in lines[0].lower():
                chamber = "lower"
            else:
                chamber = "upper"

            # last three words of line 1 are the date, line 2 is the time
            day_part = " ".join(lines[1].strip().split()[-3:]).title()
            date_str = day_part + " " + lines[2].strip()

            parsed = datetime.datetime.strptime(date_str,
                                                "%b %d, %Y %I:%M:%S %p")
            vote_date = parsed.strftime("%Y-%m-%d %H:%M:%S")

            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            result_lines = [candidate.upper() for candidate in lines[3:5]]
            passed = None
            for keyword, outcome in result_types.items():
                # no early exit: a later keyword that also matches wins
                if any(keyword in candidate for candidate in result_lines):
                    passed = outcome

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                # the last token of lines 4-7 holds the four totals
                yeas, nays, excused, not_voting = (
                    int(lines[row].split()[-1]) for row in range(4, 8)
                )
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(chamber=chamber,
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=original_chamber,
                             start_date=vote_date,
                             motion_text=motion,
                             result="pass" if passed else "fail",
                             classification="passage")

            for option, total in (('yes', yeas),
                                  ('no', nays),
                                  ('excused', excused),
                                  ('not voting', not_voting)):
                vote.set_count(option, total)
            vote.add_source(proxy_link)

            currently_counting = ""

            for line in lines[8:]:
                # NOTE(review): the text was decoded from UTF-8 above, so a
                # non-breaking space would appear as "\xa0"; these literals
                # ("\xc2" followed by "\xa0") look like Python 2 leftovers --
                # confirm before changing them.
                line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                line = line.replace("\xc2\xa0", " -")

                squashed = line.lower().replace(" ", "")
                heading = next(
                    (option for marker, option in section_markers
                     if marker in squashed),
                    None)
                if heading is not None:
                    currently_counting = heading
                    continue

                # Nothing to record before the first heading; also skip the
                # version number which is often found at the bottom of the doc
                if currently_counting == "" or re.search(r'v\. \d\.\d', line):
                    continue

                for chunk in line.split("  "):
                    voter = chunk.strip()
                    if voter:
                        vote.vote(currently_counting, voter)

            yield vote
Esempio n. 23
0
    def scrape_upper(self):
        """Yield an ``Event`` for each committee meeting in the Senate calendar PDF.

        The converted calendar text is cut into per-day sections (the
        headings have no year, so the current year is appended) and then into
        per-committee sections.  A section becomes an event only when a
        meeting time, location and description can be matched in it.  Senate
        bills mentioned in the description are attached as agenda items.
        """
        PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text').decode()
        os.remove(path)

        # Description lines that carry no information about the meeting.
        ignored_prefixes = ("Page ", "*Possible Vote")

        # re.split with one capture group returns
        # [preamble, heading, body, heading, body, ...]
        days = re.split(r'(\w+day, \w+ \d{1,2})', text)
        for day_pos in range(1, len(days), 2):
            # Calendar is put out for the current week, so use that year
            date = days[day_pos] + ", " + str(datetime.datetime.now().year)

            # Same layout again: [preamble, committee, details, ...]
            events = re.split(r'\n\n((?:\w+\s?)+),\s', days[day_pos + 1])
            for event_pos in range(1, len(events), 2):
                comm = events[event_pos].strip()
                details = events[event_pos + 1]

                found = re.search(
                        r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?)\n  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''',
                        details)
                if found is None:
                    # not a meeting entry
                    continue
                (clock, location, description) = found.groups()

                naive_start = datetime.datetime.strptime(
                        clock + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                        )
                start = self._tz.localize(naive_start)

                location = location.strip()

                stripped = (raw.strip() for raw in description.split('\n'))
                description = '\n'.join([
                        entry for entry in stripped
                        if entry
                        and not entry.startswith(ignored_prefixes)
                        and entry != "NO OTHER COMMITTEES WILL MEET"
                        ])

                if not description:
                    description = '[No description provided by state]'

                meeting = Event(
                        name=description,
                        start_date=start,
                        location_name=location,
                        description=description
                )

                meeting.add_source(PDF_URL)
                meeting.add_participant(comm, type='committee', note='host')
                for line in description.split('\n'):
                    bill_found = re.search(r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                                           line)
                    if not bill_found:
                        continue
                    (bill_name, relation) = bill_found.groups()
                    item = meeting.add_agenda_item(relation.strip())
                    item.add_bill(bill_name.replace(".", ""))

                yield meeting
Esempio n. 24
0
    def scrape_journal(self, session, url):
        """Yield a ``VoteEvent`` for each roll call found in a journal PDF.

        The PDF at ``url`` is downloaded, converted to text and walked
        line by line with a small state machine:

        * ``None`` - not inside a vote
        * ``question_quote`` - inside a quoted question, waiting for
          ``QUESTION_MATCH_END``
        * ``pre-yes`` - a vote has been created, no tally line seen yet
        * ``yes`` / ``no`` / ``other`` - reading the names for that tally

        A vote whose ``(bill_id, date, question)`` is already in
        ``self._seen`` is not created again.

        :param session: passed as ``legislative_session`` of every vote
        :param url: journal PDF location; also added as each vote's source
        :raises Exception: if a question ends before any bill id was seen
        """
        journal, _ = self.urlretrieve(url)
        text = convert_pdf(journal, type='text').decode()
        lines = text.splitlines()

        # Names that end up separated from the next name by a single space,
        # so the 3+ whitespace split below does not pull them apart.
        long_names = ('Lautenbaugh ', 'Langemeier ', 'McCollister ',
                      'Pansing Brooks ', 'Schumacher ')

        state = None
        vote = None
        question = None
        date = None
        # Initialised so that a question seen before any bill reaches the
        # explicit "cannot save vote" error instead of an UnboundLocalError.
        bill_id = None

        for line in lines:
            date_match = DATE_RE.findall(line)

            # skip headers
            if 'LEGISLATIVE JOURNAL' in line:
                continue

            elif date_match:
                date = datetime.datetime.strptime(' '.join(date_match[0]),
                                                  '%B %d %Y')
                continue

            # keep adding lines to question while quotes are open
            elif state == 'question_quote':
                question += ' %s' % line

            elif state in ('pre-yes', 'yes', 'no', 'other'):
                yes_match = YES_RE.match(line)
                no_match = NO_RE.match(line)
                other_match = NOT_VOTING_RE.match(line)
                if yes_match:
                    vote.set_count('yes', int(yes_match.group(1)))
                    state = 'yes'
                elif no_match:
                    vote.set_count('no', int(no_match.group(1)))
                    state = 'no'
                elif other_match:
                    vote.set_count('other', int(other_match.group(1)))
                    state = 'other'
                elif 'having voted in the affirmative' in line:
                    vote.result = 'pass'
                    state = None
                    vote.validate()
                    yield vote
                    vote = None
                elif 'Having failed' in line:
                    vote.result = 'fail'
                    state = None
                    vote.validate()
                    yield vote
                    vote = None
                elif line:
                    for person in re.split(r'\s{3,}', line):
                        if not person:
                            continue
                        prefix = next(
                            (n for n in long_names if person.startswith(n)),
                            None)
                        if prefix is None:
                            vote.vote(state, person)
                        else:
                            # Cut after the whole known name, so that a
                            # two-word name such as "Pansing Brooks" is
                            # kept together.
                            vote.vote(state, prefix.rstrip())
                            vote.vote(state, person[len(prefix):])

            # check the text against our regexes
            bill_match = BILL_RE.match(line)
            veto_match = VETO_BILL_RE.findall(line)
            question_match = QUESTION_RE.findall(line)
            if bill_match:
                bill_type, bill_id = bill_match.groups()
                if bill_type == 'BILL':
                    bill_id = 'LB ' + bill_id
                elif bill_type == 'RESOLUTION':
                    bill_id = 'LR ' + bill_id
            elif question_match:
                question = question_match[0]
                state = 'question_quote'
            elif veto_match:
                bill_id = veto_match[0]

            # line just finished a question
            if state == 'question_quote' and QUESTION_MATCH_END in question:
                question = re.sub(
                    r'\s+', ' ',
                    question.replace(QUESTION_MATCH_END, '').strip())

                if not bill_id:
                    raise Exception('cannot save vote without bill_id')

                # skip votes that were already emitted
                vtuple = (bill_id, date, question)
                if vtuple in self._seen:
                    vote = None
                    continue
                self._seen.add(vtuple)

                # result is a placeholder until the outcome line is read
                vote = VoteEvent(
                    bill=bill_id,
                    bill_chamber='legislature',
                    chamber='legislature',
                    legislative_session=session,
                    start_date=date.strftime('%Y-%m-%d'),
                    motion_text=question,
                    classification='passage',
                    result='fail',
                )
                vote.add_source(url)
                state = 'pre-yes'
                # reset bill_id and question
                bill_id = question = None
Esempio n. 25
0
 def _get_pdf(self, url):
     """Download the PDF at ``url`` and return its text conversion.

     The downloaded file is removed afterwards, including when the
     conversion raises, so failed conversions do not leave files behind.

     :param url: location of the PDF to fetch
     :returns: whatever ``convert_pdf(path, type='text')`` produces
     """
     (path, response) = self.urlretrieve(url)
     try:
         return convert_pdf(path, type='text')
     finally:
         os.remove(path)
Esempio n. 26
0
    def parse_vote_pdf(self, vote_url, bill):
        """Build a ``VoteEvent`` from a roll-call PDF.

        The PDF is converted to text; the line containing both "Yeas" and
        "Nays" supplies the totals, the lines just above it supply the
        motion, and the lines below it supply the voter names grouped by
        vote type.

        :param vote_url: URL of the vote PDF; a URL containing "Senate" is
            treated as the upper chamber, anything else as the lower
        :param bill: bill object the vote belongs to (its ``identifier``
            is used to build ``pupa_id``)
        :returns: the populated ``VoteEvent``
        :raises ValueError: if no totals line is found, or if the number
            of names read does not match the totals
        """
        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        chamber = "upper" if "Senate" in vote_url else "lower"

        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # Index of the totals line; compared against None explicitly so
        # that a match on the very first line is not mistaken for "missing".
        page_index = next(
            (
                index
                for index, line in enumerate(lines)
                if "Yeas" in line and "Nays" in line
            ),
            None,
        )
        if page_index is None:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        vote_types = ["yes", "no", "not voting", "excused", "absent"]
        vote_counts = 5 * [0]
        # Each column of the totals line looks like "<number> <label>"
        for index, count in enumerate(re.split(r"\s{2,}", lines[page_index].strip())):
            number, _label = count.split(" ", 1)
            vote_counts[index] = int(number)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            "Consent Calendar" in line for line in lines[:page_index]
        )
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]

        def looks_like_motion(candidate):
            """Return True if ``candidate`` contains any motion keyword."""
            lowered = candidate.lower()
            return any(keyword in lowered for keyword in motion_keywords)

        # Offsets above the totals line to try, in order of preference
        motion_lines = [3, 2, 4, 5]

        for offset in motion_lines:
            if looks_like_motion(motion):
                break
            motion = re.split(r"\s{2,}", lines[page_index - offset].strip())[0]
        else:
            if not looks_like_motion(motion):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not looks_like_motion(motion):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for vote_type, total in zip(vote_types, vote_counts):
            vote.set_count(vote_type, total)

        # Lines containing any of these mark the switch to the next vote type
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        # Names start two lines below the totals line
        for raw_line in lines[page_index + 2:]:
            current_line = raw_line.strip()

            if not current_line or "Voting Yea" in current_line:
                continue

            if any(show_stopper in current_line for show_stopper in show_stoppers):
                vote_index += 1
                continue

            names = re.split(r"\s{2,}", current_line)
            vote_name_counts[vote_index] += len(names)
            for name in names:
                vote.vote(vote_types[vote_index], name)

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Esempio n. 27
0
    def scrape_journal(self, url, chamber, session, date):
        """Yield a ``VoteEvent`` for each bill vote found in a journal PDF.

        The journal is downloaded and converted to text.  Every line that
        starts with "On the question" and mentions "shall" begins a motion;
        following lines are appended until the chamber-specific end of the
        motion is reached, then ``self.parse_votes`` reads the roll call
        from the same line iterator.

        :param url: journal PDF location, added as the source of each vote
        :param chamber: "upper" or "lower"; selects how motions terminate
        :param session: passed as ``legislative_session`` of each vote
        :param date: passed as ``start_date`` of each vote
        :raises ValueError: if a vote is reported as passed without more
            yes than no votes
        """
        filename, response = self.urlretrieve(url)
        self.logger.info('Saved journal to %r' % filename)
        all_text = convert_pdf(filename, type="text")

        lines = [raw.decode('utf-8') for raw in all_text.split(b'\n')]
        # Normalise typographic dashes and quote-like characters
        lines = [
            line.strip().replace('–', '-').replace('―', '"').replace(
                '‖', '"').replace('“', '"').replace('”', '"') for line in lines
        ]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter([
            line for line in lines
            if not (line == "" or re.match(header_date_re, line)
                    or re.match(header_journal_re, line))
        ])

        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        motion_re = r'''
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*"\s+  # Motion is followed by a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    '''.format(
            bill_re, r',?.*?the\svote\swas:' if chamber == 'upper' else '')

        for line in lines:
            # Only lines that open a question AND mention "shall" start a
            # vote; everything else is skipped.
            if not line.startswith("On the question") or \
                    "shall" not in line.lower():
                continue

            # Motions may wrap; keep joining lines until the end marker.
            # next() is guarded because a StopIteration escaping from a
            # generator body is turned into RuntimeError (PEP 479).
            try:
                while not re.match(end_of_motion_re, line, re.IGNORECASE):
                    line += " " + next(lines)
            except StopIteration:
                self.warning(
                    "Journal ended in the middle of a motion: {}".format(
                        line))
                return

            # Get the bill_id
            bill_match = re.search(bill_re, line)
            if bill_match is None:
                self.warning(
                    "This motion did not pertain to legislation: {}".format(
                        line))
                continue
            bill_id = bill_match.group(1)

            # Get the motion text
            motion = re.search(motion_re, line,
                               re.VERBOSE | re.IGNORECASE).group(1)

            for word, letter in (('Senate', 'S'), ('House', 'H'),
                                 ('File', 'F')):
                bill_id = bill_id.replace(word, letter)
            bill_id = bill_id.replace('.', '')

            # First letter of the identifier tells the chamber of origin
            bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
            votes, passed = self.parse_votes(lines)

            has_majority = votes['yes_count'] > votes['no_count']

            # at the very least, there should be a majority
            # for the bill to have passed
            if passed and not has_majority:
                self.error("The bill passed without a majority?")
                raise ValueError('invalid vote')

            # a failed vote with a majority could be OK (eg constitutional
            # amendments), but is probably something we'd want to check
            if not passed and has_majority:
                self.logger.warning(
                    "The bill got a majority but did not pass. "
                    "Could be worth confirming.")

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion.replace('\xad', '-'),
                             result="pass" if passed else "fail",
                             classification='passage',
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=bill_chamber)

            # add votes and counts
            for vtype in ('yes', 'no', 'absent', 'abstain'):
                vcount = votes['{}_count'.format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes['{}_votes'.format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
Esempio n. 28
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Yield one ``VoteEvent`` built from the roll-call PDF at ``url``.

        The PDF text is scanned for section headers ("yeas--", "nays--",
        ...); the comma-separated names after each header are collected
        for that vote type until a "None." or "Total--" marker closes the
        section.  Counts are the number of names collected per type.

        :param url: vote PDF location; also the vote's source and part of
            its ``pupa_id``
        :param motion: key into ``self._vote_mapping``, which supplies the
            motion text and whether the vote passed
        :param date: naive datetime, localized with ``self._tz``
        :param chamber: chamber the vote took place in
        :param bill: bill object the vote belongs to
        :raises KeyError: if ``motion`` is not in ``self._vote_mapping``
        """
        vote_pdf, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(vote_pdf, 'text')
        finally:
            # remove the download even when conversion fails
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # Order matters: longer headers come before the shorter headers
        # they contain ("absent or those not voting--" before
        # "not voting--", "voting present--" before "present--").
        precursors = (
            ('yeas--', yes_votes),
            ('nays--', no_votes),
            ('absent or those not voting--', absent_votes),
            ('absent and those not voting--', absent_votes),
            ('not voting--', not_voting_votes),
            ('voting present--', other_votes),
            ('present--', other_votes),
            ('disclaimer', None),
        )

        # fragments that mark a piece of text as something other than a name
        junk_markers = ('on final passage', 'Necessary', 'who would have',
                        'being a tie', 'therefore', 'Vacancies', 'a pair',
                        'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD',
                        'TREASURER', 'GOVERNOR', 'ARCHIVES', 'SECRETARY')

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split('\n'))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    # Detection above ignores case, so removal must too;
                    # otherwise the header stays in the first name and a
                    # shorter precursor can match the same text again.
                    line = re.sub(re.escape(pc), '', line,
                                  flags=re.IGNORECASE)

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None

                match = re.match(r'(.+?)\. Total--.*', name)
                if match:
                    # a Total outside any section has nowhere to go
                    if cur_array is not None:
                        cur_array.append(match.group(1))
                    cur_array = None

                # append name if it looks ok
                if cur_array is None:
                    continue
                if any(junk in name for junk in junk_markers):
                    continue
                # strip trailing .
                cur_array.append(name[:-1] if name.endswith('.') else name)

        tallies = (
            ('yes', yes_votes),
            ('no', no_votes),
            ('absent', absent_votes),
            ('not voting', not_voting_votes),
            ('other', other_votes),
        )

        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(date),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passage',
                         bill=bill)
        vote.pupa_id = url + '#' + bill.identifier

        for option, names in tallies:
            vote.set_count(option, len(names))
        vote.add_source(url)
        for option, names in tallies:
            for voter_name in names:
                vote.vote(option, voter_name)
        yield vote
Esempio n. 29
0
    def scrape_votes(self, vote_url, bill, chamber):
        """Yield a ``VoteEvent`` for each roll call found in a vote PDF.

        The text produced by ``convert_pdf(vote_url, "text")`` is scanned
        line by line.  Date, time/motion, totals and voter names are picked
        up as their marker strings are encountered; a ``VoteEvent`` is
        built and yielded each time a "NOT VOTING : " line is reached.

        :param vote_url: passed to ``convert_pdf`` and added as the source
        :param bill: bill the votes belong to
        :param chamber: chamber passed to each ``VoteEvent``
        """
        # Grabs text from pdf, decoding every line from UTF-8
        pdflines = [
            line.decode("utf-8")
            for line in convert_pdf(vote_url, "text").splitlines()
        ]
        vote_date = 0
        voters = defaultdict(list)
        for x in range(len(pdflines)):
            line = pdflines[x]
            # A line containing something shaped like d/d/d is kept whole
            # (stripped) as the date part of the timestamp
            if re.search(r"(\d+/\d+/\d+)", line):
                initial_date = line.strip()
            # A line containing AM/PM holds the motion followed by the time
            if ("AM" in line) or ("PM" in line):
                split_l = line.split()
                for y in split_l:
                    if ":" in y:
                        # Words before the first token containing ":" are
                        # the motion; that token and the rest are the time
                        time_location = split_l.index(y)
                        motion = " ".join(split_l[0:time_location])
                        time = split_l[time_location:]
                        if len(time) > 0:
                            time = "".join(time)
                        dt = initial_date + " " + time
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        # `central` is defined outside this block; presumably
                        # a pytz timezone -- TODO confirm
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                        # In rare case that no motion is provided
                        if len(motion) < 1:
                            motion = "No Motion Provided"
            # Totals: the last whitespace-separated token is the number
            if "YEAS:" in line:
                yeas = int(line.split()[-1])
            if "NAYS:" in line:
                nays = int(line.split()[-1])
            if "ABSTAINED:" in line:
                abstained = int(line.split()[-1])
            # "PASSES:" is stored in the same variable as "ABSTAINED:"
            if "PASSES:" in line:
                abstained = int(line.split()[-1])
            if "NOT VOTING:" in line:
                not_voting = int(line.split()[-1])

            # Name sections use "LABEL :" (space before the colon), unlike
            # the totals above.  Names on a line are separated by two spaces.
            if "YEAS :" in line:
                y = 0
                next_line = pdflines[x + y]
                while "NAYS : " not in next_line:
                    next_line = next_line.split("  ")
                    if next_line and ("YEAS" not in next_line):
                        for v in next_line:
                            if v and "YEAS" not in v:
                                voters["yes"].append(v.strip())
                    # NOTE(review): the line is fetched before `y` is
                    # incremented, so line x itself is visited twice; only
                    # its pieces containing "YEAS" are skipped.
                    next_line = pdflines[x + y]
                    y += 1
            if line and "NAYS :" in line:
                y = 0
                next_line = 0
                next_line = pdflines[x + y]
                while ("ABSTAINED : " not in next_line) and ("PASSES :"
                                                             not in next_line):
                    next_line = next_line.split("  ")
                    if next_line and "NAYS" not in next_line:
                        for v in next_line:
                            if v and "NAYS" not in v:
                                voters["no"].append(v.strip())
                    # Same fetch-then-increment order as the YEAS loop
                    next_line = pdflines[x + y]
                    y += 1

            if line and ("ABSTAINED :" in line or "PASSES :" in line):
                # Starts two lines below the header
                y = 2
                next_line = 0
                next_line = pdflines[x + y]
                while "NOT VOTING :" not in next_line:
                    next_line = next_line.split("  ")
                    # NOTE(review): `next_line` is a list here and the two
                    # tests are joined with `or`, so this condition can only
                    # be false if the list contains both exact strings --
                    # confirm the intended filter.
                    if next_line and ("ABSTAINED" not in next_line
                                      or "PASSES" not in next_line):
                        for v in next_line:
                            if v:
                                voters["abstain"].append(v.strip())
                    # NOTE(review): because `y` is still 2 on the first pass,
                    # line x + 2 is processed twice, which would add its names
                    # to "abstain" twice -- confirm whether that is intended.
                    next_line = pdflines[x + y]
                    y += 1

            if line and "NOT VOTING : " in line:
                # Number of name lines to read is estimated from the total
                # and the number of tokens on the header line itself
                lines_to_go_through = math.ceil(not_voting / len(line.split()))
                next_line = pdflines[x]
                for y in range(lines_to_go_through):
                    next_line = pdflines[x + y + 2].split("  ")
                    for v in next_line:
                        if v:
                            voters["not voting"].append(v.strip())
                # Passing requires yeas to exceed every other category combined.
                # NOTE(review): yeas/nays/abstained/not_voting are only bound
                # if their total lines were seen earlier -- presumably always
                # the case in these PDFs; verify.
                if yeas > (nays + abstained + not_voting):
                    passed = True
                else:
                    passed = False

                ve = VoteEvent(
                    chamber=chamber,
                    start_date=vote_date,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="bill",
                    bill=bill,
                )
                ve.add_source(vote_url)
                # Record each collected name under its vote type
                for how_voted, how_voted_voters in voters.items():
                    for voter in how_voted_voters:
                        if len(voter) > 0:
                            ve.vote(how_voted, voter)
                # Resets voters dictionary before going onto next page in pdf
                voters = defaultdict(list)
                yield ve
Esempio n. 30
0
    def scrape_digest(self, bill, chamber):
        """Add digest data to ``bill`` and yield its roll-call votes.

        The digest PDF for the bill is downloaded and converted to text.
        Its description is added as an abstract, its sponsors as
        sponsorships, every dated line as an action, and every
        "ROLL CALL" section is turned into a ``VoteEvent`` that is yielded.

        :param bill: bill object to enrich; its ``legislative_session`` and
            ``identifier`` are used to build the digest URL
        :param chamber: chamber of the bill ("lower" or "upper"); used as
            the initial actor and as the chamber of each vote
        """
        digest_url = 'http://legisweb.state.wy.us/{}/Digest/{}.pdf'.format(
            bill.legislative_session,
            bill.identifier,
        )
        bill.add_source(digest_url)

        try:
            (filename, response) = self.urlretrieve(digest_url)
            all_text = convert_pdf(filename, type='text').decode()
        except scrapelib.HTTPError:
            self.warning('no digest for %s' % bill.identifier)
            return
        if all_text.strip() == "":
            self.warning(
                'Non-functional digest for bill {}'.
                format(bill.identifier)
            )
            return

        # Split the digest's text into sponsors, description, and actions
        SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
        DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'

        description_match = re.search(DESCRIPTION_RE, all_text)
        ext_title = description_match.group(1) if description_match else ''
        bill_desc = ext_title.replace('\n', ' ')
        bill_desc = re.sub("  *", " ", bill_desc)
        if bill_desc:
            bill.add_abstract(abstract=bill_desc, note='description')

        # A digest without a sponsor block is tolerated the same way a
        # missing description is, instead of aborting the whole bill.
        sponsor_match = re.search(SPONSOR_RE, all_text)
        if sponsor_match is None:
            self.warning('no sponsors in digest for %s' % bill.identifier)
            sponsors = ''
        else:
            sponsors = sponsor_match.group(1).replace('\n', ' ')
        if sponsors:
            if 'Committee' in sponsors:
                bill.add_sponsorship(sponsors, 'primary', primary=True, entity_type='organization')
            else:
                if chamber == 'lower':
                    sp_lists = sponsors.split('and Senator(s)')
                else:
                    sp_lists = sponsors.split('and Representative(s)')
                for spl in sp_lists:
                    for sponsor in split_names(spl):
                        sponsor = sponsor.strip()
                        if sponsor != "":
                            bill.add_sponsorship(sponsor, 'primary', primary=True,
                                                 entity_type='person')

        action_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
        vote_total_re = re.compile(r'(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)'
                                   r'(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)')
        breakers = ("Ayes:", "Nays:", "Nayes:", "Excused:",
                    "Absent:", "Conflicts:")

        # initial actor is bill chamber
        actor = chamber

        # Actions start at the first line that contains a dated entry
        lines = all_text.splitlines()
        first_action = next(
            (idx for idx, text_line in enumerate(lines)
             if action_re.search(text_line)),
            len(lines))

        # One shared iterator: the roll-call loop below must continue from
        # the current position and consume the lines it reads, rather than
        # restart from the first action line.
        action_lines = iter(lines[first_action:])

        for line in action_lines:
            line = clean_line(line)

            # skip blank lines
            if not line:
                continue

            amatch = action_re.match(line)
            if amatch:
                date, achamber, action = amatch.groups()

                # change actor if one is on this action
                if achamber == 'H ':
                    actor = 'lower'
                elif achamber == 'S ':
                    actor = 'upper'

                date = datetime.datetime.strptime(date, '%m/%d/%Y')
                bill.add_action(action.strip(), TIMEZONE.localize(date), chamber=actor,
                                classification=categorize_action(action))
            elif line == 'ROLL CALL':
                voters_by_type = defaultdict(str)
                # if we hit a roll call, use an inner loop to consume lines
                # in a psuedo-state machine manner, 3 types
                # Ayes|Nays|Excused|... - indicates next line is voters
                # : (Senators|Representatives): ... - voters
                # \d+ Nays \d+ Excused ... - totals
                voters_type = None
                for ainext in action_lines:
                    nextline = clean_line(ainext)
                    if not nextline:
                        continue

                    for breaker in breakers:
                        if nextline.startswith(breaker):
                            voters_type = breaker[:-1]
                            if voters_type == "Nayes":
                                voters_type = "Nays"
                                self.log("Fixed a case of 'Naye-itis'")
                            # keep the ':' so the ': ' test below matches
                            nextline = nextline[len(breaker) - 1:]

                    totals = vote_total_re.match(nextline)
                    if nextline.startswith(': '):
                        voters_by_type[voters_type] = nextline
                    elif nextline in ('Ayes', 'Nays', 'Excused', 'Absent',
                                      'Conflicts'):
                        voters_type = nextline
                    elif totals:
                        tup = totals.groups()
                        ayes = tup[1]
                        nays = tup[4]
                        excused = tup[7]
                        absent = tup[10]
                        conflicts = tup[13]

                        passed = (('Passed' in action or
                                   'Do Pass' in action or
                                   'Did Concur' in action or
                                   'Referred to' in action) and
                                  'Failed' not in action)
                        vote = VoteEvent(
                            chamber=chamber,
                            start_date=TIMEZONE.localize(date),
                            motion_text=action,
                            result='pass' if passed else 'fail',
                            classification='passage',
                            bill=bill,
                        )
                        vote.set_count('yes', int(ayes))
                        vote.set_count('no', int(nays))
                        vote.set_count(
                            'other',
                            int(excused) + int(absent) + int(conflicts))
                        vote.add_source(digest_url)

                        for vtype, names in voters_by_type.items():
                            for voter in split_names(names):
                                if voter:
                                    if vtype == 'Ayes':
                                        vote.vote('yes', voter)
                                    elif vtype == 'Nays':
                                        vote.vote('no', voter)
                                    else:
                                        vote.vote('other', voter)
                        # done collecting this vote
                        yield vote
                        break
                    else:
                        # if it is a stray line within the vote, is is a
                        # continuation of the voter list
                        # (sometimes has a newline)
                        voters_by_type[voters_type] += ' ' + nextline
Esempio n. 31
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
        """
        Download each roll-call PDF and yield a VoteEvent parsed from its text.

        The converted text is read positionally:

        * line 0 decides the chamber ("lower" when it mentions
          "house of representatives", otherwise "upper");
        * the last three tokens of line 1 plus line 2 form the timestamp;
        * lines 3-4 are searched for a result keyword (PASSED, FAILED, ...);
        * line 4 also carries the motion text and the yea count, and lines
          5-7 carry the nay / excused / not-voting counts (last token each);
        * lines 8 onward list the voters under "YEA -", "NAY -",
          "EXCUSED -" and "NOT VOTING -" section headers.

        :param rollcalls: iterable of mappings, each providing a "link" key
        :param bill_id: value passed to VoteEvent as ``bill``
        :param original_chamber: value passed to VoteEvent as ``bill_chamber``
        :param session: value passed to VoteEvent as ``legislative_session``
        :param proxy: mapping whose "url" value is prefixed to every link
        :returns: generator of VoteEvent objects; roll calls whose counts
            cannot be read are skipped with a warning
        :raises AssertionError: when no known result keyword is found
        """
        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True,
            'NOT SECONDED': False,
            'OVERRIDDEN': True,
            'ADOPTED': True,
        }

        # Section headers, compared against the lower-cased line with all
        # spaces removed, mapped to the vote option they introduce.
        section_markers = (
            ("yea-", "yes"),
            ("nay-", "no"),
            ("excused-", "excused"),
            ("notvoting-", "not voting"),
        )

        for r in rollcalls:
            proxy_link = proxy["url"] + r["link"]
            path, _resp = self.urlretrieve(proxy_link)
            try:
                text = convert_pdf(path, 'text').decode("utf-8")
            finally:
                # Remove the downloaded file even when conversion or
                # decoding fails, so failed roll calls do not leak files.
                os.remove(path)
            lines = text.split("\n")

            chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
            date_parts = lines[1].strip().split()[-3:]
            date_str = " ".join(date_parts).title() + " " + lines[2].strip()

            vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
            vote_date = pytz.timezone('America/Indiana/Indianapolis').localize(vote_date)
            vote_date = vote_date.isoformat()

            passed = None

            # We check multiple lines because the result of the roll call
            # vote as parsed can potentially be split across them.
            # If several keywords match, the one latest in result_types wins.
            result_lines = [line.upper() for line in lines[3:5]]
            for res, val in result_types.items():
                if any(res in line for line in result_lines):
                    passed = val

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                yeas = int(lines[4].split()[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except (ValueError, IndexError):
                # ValueError: the last token is not a number.
                # IndexError: a count line is blank or missing entirely.
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(chamber=chamber,
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=original_chamber,
                             start_date=vote_date,
                             motion_text=motion,
                             result="pass" if passed else "fail",
                             classification="passage")

            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.set_count('excused', excused)
            vote.set_count('not voting', not_voting)
            vote.add_source(proxy_link)

            currently_counting = ""

            for raw_line in lines[8:]:
                # NOTE(review): "\xc2\xa0" is the UTF-8 byte pair of a
                # non-breaking space written as two characters; on already
                # decoded text it presumably never matches -- confirm whether
                # "\xa0" was intended before changing it.
                line = raw_line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                line = line.replace("\xc2\xa0", " -")

                compact = line.lower().replace(" ", "")
                section = next(
                    (option for marker, option in section_markers if marker in compact),
                    None,
                )
                if section is not None:
                    currently_counting = section
                    continue

                if currently_counting == "":
                    # nothing before the first section header is a voter
                    continue

                if re.search(r'v\. \d\.\d', line):
                    # this gets rid of the version number
                    # which is often found at the bottom of the doc
                    continue

                # voter names on one line are separated by two spaces
                for voter in line.split("  "):
                    voter = voter.strip()
                    if voter:
                        vote.vote(currently_counting, voter)

            yield vote
Esempio n. 32
0
    def parse_vote_pdf(self, vote_url, bill):
        """
        Download a vote PDF and build a VoteEvent from its text.

        The chamber is "upper" when ``vote_url`` contains "Senate", otherwise
        "lower". The date is read from the text after "Calendar Date:" on
        the first line. The first line containing both "Yeas" and "Nays"
        supplies the tallies, the lines just above it are searched for the
        motion text, and the lines from two below it are read as voter
        names grouped by section headings.

        :param vote_url: URL of the vote PDF
        :param bill: bill object; passed to VoteEvent and its ``identifier``
            is used to build ``pupa_id``
        :returns: the populated VoteEvent
        :raises ValueError: when the tally line is not found (an index of 0
            is treated as not found), or when the number of names collected
            per vote type differs from the tallies
        """
        # Function-scope import: needed only for the temp-file cleanup below.
        import os

        filename, _response = self.urlretrieve(vote_url)
        try:
            text = convert_pdf(filename, type='text').decode()
        finally:
            # The downloaded file is not needed once converted; remove it so
            # repeated calls do not accumulate files. Cleanup is best-effort
            # and must never mask a parsing error or abort the scrape.
            try:
                os.remove(filename)
            except OSError:
                self.warning("Could not remove downloaded file %s" % filename)
        lines = text.splitlines()

        chamber = 'upper' if 'Senate' in vote_url else 'lower'

        date_string = lines[0].split('Calendar Date:')[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # Index of the first line holding the tallies ("NN Yeas  NN Nays ...")
        page_index = next(
            (index for index, line in enumerate(lines)
             if 'Yeas' in line and 'Nays' in line),
            None,
        )
        if not page_index:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']
        vote_counts = 5*[0]

        # Each tally is "<number> <label>"; tallies are separated by runs of
        # two or more whitespace characters and map positionally to vote_types.
        for index, count in enumerate(re.split(r'\s{2,}', lines[page_index].strip())):
            number, _label = count.split(' ', 1)
            vote_counts[index] = int(number)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any('Consent Calendar' in line for line in lines[:page_index])
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r'\s{2,}', lines[page_index-1].strip())
            assert consent_calendar_bills, "Could not find bills for consent calendar vote"

        motion_keywords = ['favorable', 'reading', 'amendment', 'motion', 'introduced',
                           'bill pass', 'committee']

        def has_motion_keyword(candidate):
            """Return True when `candidate` contains any motion keyword."""
            lowered = candidate.lower()
            return any(keyword in lowered for keyword in motion_keywords)

        # Offsets (lines above the tally line) checked, in order, for the
        # motion. The current candidate is tested before the next one is
        # loaded, so the last candidate is tested in the `else` branch.
        for offset in (3, 2, 4, 5):
            if has_motion_keyword(motion):
                break
            motion = re.split(r'\s{2,}', lines[page_index - offset].strip())[0]
        else:
            if not has_motion_keyword(motion):
                # This condition covers for the bad formatting in SB 1260
                motion = lines[page_index-3]
            if not has_motion_keyword(motion):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

        for vote_type, tally in zip(vote_types, vote_counts):
            vote.set_count(vote_type, tally)

        # Keywords for identifying where names are located in the pdf;
        # each line containing one advances to the next entry of vote_types.
        show_stoppers = ['Voting Nay', 'Not Voting',
                         'COPY', 'Excused', 'indicates vote change',
                         'Indicates Vote Change']
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5*[0]

        # Names start two lines below the tally line.
        for raw_line in lines[page_index + 2:]:
            current_line = raw_line.strip()

            if not current_line or 'Voting Yea' in current_line:
                continue

            if any(show_stopper in current_line for show_stopper in show_stoppers):
                vote_index += 1
                continue

            names = re.split(r'\s{2,}', current_line)
            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote