Beispiel #1
0
    def scrape_senate_vote(self, bill, url, date):
        """Yield the Senate passage vote described by the PDF at ``url``.

        The PDF is downloaded, converted to text and removed again.  If the
        text carries a ``Yea: N  Nay: N  Absent: N`` summary line the work is
        delegated to ``scrape_senate_vote_3col``; otherwise the text is split
        on its ``Yea``/``Nay``/``Absent`` labels and every name following a
        label is recorded under that label's option.

        :param bill: passed through to ``Vote`` as its ``bill``.
        :param url: location of the vote PDF; also recorded as the source.
        :param date: object providing ``strftime``; used as the start date.

        Nothing is yielded when the download raises ``scrapelib.HTTPError``.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote(
            chamber='upper',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text='Passage',
            # placeholder; replaced below once the roll call is tallied
            result='fail',
            classification='passage',
            bill=bill
        )
        vote.add_source(url)

        # Always drop the downloaded file, even if the conversion blows up.
        try:
            text = convert_pdf(filename, 'text').decode('utf-8')
        finally:
            os.remove(filename)

        if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        # The capture group makes re.split keep the labels, giving
        # [preamble, label, names, label, names, ...].  The list is reversed
        # so pop() consumes it in document order.  It must be a real list:
        # filter() returns a lazy iterator on Python 3, which is always
        # truthy and has no pop().
        # NOTE(review): assumes any text before the first label is empty so
        # that the first popped chunk is a label - confirm against real PDFs.
        data = [
            chunk
            for chunk in re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
            if chunk
        ]
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)
        vote_count = {
            'yes': 0,
            'no': 0,
            'other': 0
        }
        while data:
            vote_val = data.pop()
            # anything that is not Yea/Nay (i.e. Absent) is counted as 'other'
            key = keymap.get(vote_val.lower(), 'other')
            values = data.pop()
            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
                if name.lower().strip() == 'none.':
                    continue
                # strip dot leaders, a trailing period and any tally digits
                name = name.replace('..', '')
                name = re.sub(r'\.$', '', name)
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # passes only when yes outnumbers no and other combined
        vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] +
                                                     vote_count['other']) else 'fail'

        yield vote
Beispiel #2
0
    def scrape_senate_vote(self, bill, url, date):
        """Yield the Senate passage vote described by the PDF at ``url``.

        The PDF is downloaded, converted to text and removed again.  If the
        text carries a ``Yea: N  Nay: N  Absent: N`` summary line the work is
        delegated to ``scrape_senate_vote_3col``; otherwise the text is split
        on its ``Yea``/``Nay``/``Absent`` labels and every name following a
        label is recorded under that label's option.

        :param bill: passed through to ``Vote`` as its ``bill``.
        :param url: location of the vote PDF; also recorded as the source.
        :param date: object providing ``strftime``; used as the start date.

        Nothing is yielded when the download raises ``scrapelib.HTTPError``.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote(
            chamber='upper',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text='Passage',
            # placeholder; replaced below once the roll call is tallied
            result='fail',
            classification='passage',
            bill=bill
        )
        vote.add_source(url)

        # Always drop the downloaded file, even if the conversion blows up.
        try:
            text = convert_pdf(filename, 'text').decode('utf-8')
        finally:
            os.remove(filename)

        if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        # The capture group makes re.split keep the labels, giving
        # [preamble, label, names, label, names, ...].  The list is reversed
        # so pop() consumes it in document order.  It must be a real list:
        # filter() returns a lazy iterator on Python 3, which is always
        # truthy and has no pop().
        # NOTE(review): assumes any text before the first label is empty so
        # that the first popped chunk is a label - confirm against real PDFs.
        data = [
            chunk
            for chunk in re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
            if chunk
        ]
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)
        vote_count = {
            'yes': 0,
            'no': 0,
            'other': 0
        }
        while data:
            vote_val = data.pop()
            # anything that is not Yea/Nay (i.e. Absent) is counted as 'other'
            key = keymap.get(vote_val.lower(), 'other')
            values = data.pop()
            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
                if name.lower().strip() == 'none.':
                    continue
                # strip dot leaders, a trailing period and any tally digits
                name = name.replace('..', '')
                name = re.sub(r'\.$', '', name)
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # passes only when yes outnumbers no and other combined
        vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] +
                                                     vote_count['other']) else 'fail'

        yield vote
Beispiel #3
0
    def scrape_senate_vote(self, bill, url, date):
        """Yield the Senate passage vote described by the PDF at ``url``.

        The PDF is downloaded, converted to text and removed again.  If the
        text carries a ``Yea: N  Nay: N  Absent: N`` summary line the work is
        delegated to ``scrape_senate_vote_3col``; otherwise the text is split
        on its ``Yea``/``Nay``/``Absent`` labels and every name following a
        label is recorded under that label's option.

        :param bill: passed through to ``VoteEvent`` as its ``bill``.
        :param url: location of the vote PDF; recorded as the source and
            used as the event's ``pupa_id``.
        :param date: object providing ``strftime``; used as the start date.

        Nothing is yielded when the download raises ``scrapelib.HTTPError``.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber="upper",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text="Passage",
            # placeholder; replaced below once the roll call is tallied
            result="fail",
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        vote.pupa_id = url

        # Always drop the downloaded file, even if the conversion blows up.
        try:
            text = convert_pdf(filename, "text").decode("utf-8")
        finally:
            os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        # The capture group makes re.split keep the labels, giving
        # [preamble, label, names, label, names, ...].  The list is reversed
        # so pop() consumes it in document order.  It must be a real list:
        # filter() returns a lazy iterator on Python 3, which is always
        # truthy and has no pop().
        # NOTE(review): assumes any text before the first label is empty so
        # that the first popped chunk is a label - confirm against real PDFs.
        data = [
            chunk
            for chunk in re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
            if chunk
        ]
        keymap = dict(yea="yes", nay="no")
        actual_vote = collections.defaultdict(int)
        vote_count = {"yes": 0, "no": 0, "other": 0}
        while data:
            vote_val = data.pop()
            # anything that is not Yea/Nay (i.e. Absent) is counted as "other"
            key = keymap.get(vote_val.lower(), "other")
            values = data.pop()
            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
                if name.lower().strip() == "none.":
                    continue
                # strip dot leaders, a trailing period and any tally digits
                name = name.replace("..", "")
                name = re.sub(r"\.$", "", name)
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # passes only when yes outnumbers no and other combined
        vote.result = (
            "pass"
            if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
            else "fail"
        )

        yield vote
Beispiel #4
0
 def get_house_pdf(self, vurl):
     """Return the text of the house PDF at ``vurl``, converting it only once.

     Results are memoised in ``self.house_pdf_cache`` keyed by URL.  On a
     miss the PDF is downloaded, converted to text, the downloaded file is
     removed, and the UTF-8 decoded text (with U+2019 replaced by a plain
     apostrophe) is stored in the cache before being returned.
     """
     if vurl in self.house_pdf_cache:
         return self.house_pdf_cache[vurl]

     downloaded, _response = self.urlretrieve(vurl)
     raw = convert_pdf(downloaded, 'text')
     os.remove(downloaded)

     text = raw.decode('utf-8').replace(u'\u2019', "'")
     self.house_pdf_cache[vurl] = text
     return text
Beispiel #5
0
 def fetch_pdf_lines(self, href):
     """Download the PDF at ``href`` and return its text as a list of lines.

     Each line of the converted text is decoded from UTF-8 and the
     downloaded file is removed afterwards.  When the download raises
     ``scrapelib.HTTPError`` the error is asserted to mention '404', a
     warning is logged and ``False`` is returned instead of a list.
     """
     try:
         local_path, _response = self.urlretrieve(href)
         raw_text = convert_pdf(local_path, 'text')
         decoded = []
         for raw_line in raw_text.splitlines():
             decoded.append(raw_line.decode('utf-8'))
         os.remove(local_path)
     except scrapelib.HTTPError as err:
         # any HTTP error other than a 404 trips this assertion
         assert '404' in err.args[0], "File not found: {}".format(err)
         self.warning("404 error for vote; skipping vote")
         return False
     else:
         return decoded
Beispiel #6
0
 def fetch_pdf_lines(self, href):
     """Download the PDF at ``href`` and return its text as a list of lines.

     Each line of the converted text is decoded from UTF-8 and the
     downloaded file is removed afterwards.  When the download raises
     ``scrapelib.HTTPError`` the error is asserted to mention '404', a
     warning is logged and ``False`` is returned instead of a list.
     """
     try:
         local_path, _response = self.urlretrieve(href)
         raw_text = convert_pdf(local_path, 'text')
         decoded = []
         for raw_line in raw_text.splitlines():
             decoded.append(raw_line.decode('utf-8'))
         os.remove(local_path)
     except scrapelib.HTTPError as err:
         # any HTTP error other than a 404 trips this assertion
         assert '404' in err.args[0], "File not found: {}".format(err)
         self.warning("404 error for vote; skipping vote")
         return False
     else:
         return decoded
Beispiel #7
0
    def _load_emails_from_directory_pdf(self):
        """
        Return the set of email addresses found in the house directory PDF.

        The PDF at ``self.directory_pdf_url`` is downloaded into a named
        temporary file, converted to XML and parsed with lxml.  The website
        no longer lists the addresses, so they are pulled from the PDF.
        """
        # The converted XML gives no obvious way to tie an address to a
        # member, but the addresses themselves contain the member names.
        email_xpath = '//text[contains(text(), "@myfloridahouse.gov")]/text()'

        with tempfile.NamedTemporaryFile() as pdf_file:
            pdf_path = pdf_file.name
            self.scraper.urlretrieve(self.directory_pdf_url, pdf_path)
            xml_bytes = convert_pdf(pdf_path, 'xml')
            tree = lxml.etree.fromstring(xml_bytes)

        addresses = tree.xpath(email_xpath)
        return set(addresses)
Beispiel #8
0
    def scrape_senate_vote(self, vote, vurl):
        """Add individual senators' votes from the PDF at ``vurl`` to ``vote``.

        The PDF is downloaded, converted to text and removed.  Lines starting
        with ``YEAS``, ``NAYS`` or ``ABSENT OR`` switch the current mode; all
        other non-blank lines are split on runs of three spaces into names,
        which are recorded via ``vote.yes``, ``vote.no`` or
        ``vote.vote('other', ...)`` according to the current mode.  Names seen
        before any mode line are ignored.

        :param vote: object exposing ``yes``, ``no`` and ``vote`` methods.
        :param vurl: URL of the roll-call PDF.
        """
        # download file to server
        (path, resp) = self.urlretrieve(vurl)
        # remove the download even when the conversion fails
        try:
            pdflines = convert_pdf(path, 'text')
        finally:
            os.remove(path)

        # 'y', 'n' or 'o' once a section header has been seen
        mode = None

        # handle individual lines in pdf to id legislator votes
        for line in pdflines.splitlines():
            line = line.strip()
            # normalise the Unicode minus sign so the split on '-' below works
            line = line.decode('utf-8').replace(u'\u2212', '-')
            if line == '':
                continue

            # change mode accordingly
            if line.startswith('YEAS'):
                mode = 'y'
            elif line.startswith('NAYS'):
                mode = 'n'
            elif line.startswith('ABSENT OR'):
                mode = 'o'
            # else parse line with names
            else:
                for raw_name in line.split('   '):
                    raw_name = raw_name.strip()
                    if raw_name == '':
                        continue

                    # A trailing "-<count>" is the section tally, not part of
                    # the name.  Rejoin the remaining pieces with '-' so that
                    # hyphenated surnames keep their hyphen.
                    cut_name = raw_name.split('-')
                    if cut_name[-1].strip(' .').isdigit():
                        clean_name = '-'.join(cut_name[:-1]).strip()
                    else:
                        clean_name = raw_name

                    # a bare tally leaves no name behind; nothing to record
                    if not clean_name:
                        continue

                    # update vote object with names
                    if mode == 'y':
                        vote.yes(clean_name)
                    elif mode == 'n':
                        vote.no(clean_name)
                    elif mode == 'o':
                        vote.vote('other', clean_name)
Beispiel #9
0
    def scrape_senate_vote(self, vote, vurl):
        """Add individual senators' votes from the PDF at ``vurl`` to ``vote``.

        The PDF is downloaded, converted to text and removed.  Lines starting
        with ``YEAS``, ``NAYS`` or ``ABSENT OR`` switch the current mode; all
        other non-blank lines are split on runs of three spaces into names,
        which are recorded via ``vote.yes``, ``vote.no`` or
        ``vote.vote("other", ...)`` according to the current mode.  Names seen
        before any mode line are ignored.

        :param vote: object exposing ``yes``, ``no`` and ``vote`` methods.
        :param vurl: URL of the roll-call PDF.
        """
        # download file to server
        (path, resp) = self.urlretrieve(vurl)
        # remove the download even when the conversion fails
        try:
            pdflines = convert_pdf(path, "text")
        finally:
            os.remove(path)

        # "y", "n" or "o" once a section header has been seen
        mode = None

        # handle individual lines in pdf to id legislator votes
        for line in pdflines.splitlines():
            line = line.strip()
            # normalise the Unicode minus sign so the split on "-" below works
            line = line.decode("utf-8").replace(u"\u2212", "-")
            if line == "":
                continue

            # change mode accordingly
            if line.startswith("YEAS"):
                mode = "y"
            elif line.startswith("NAYS"):
                mode = "n"
            elif line.startswith("ABSENT OR"):
                mode = "o"
            # else parse line with names
            else:
                for raw_name in line.split("   "):
                    raw_name = raw_name.strip()
                    if raw_name == "":
                        continue

                    # A trailing "-<count>" is the section tally, not part of
                    # the name.  Rejoin the remaining pieces with "-" so that
                    # hyphenated surnames keep their hyphen.
                    cut_name = raw_name.split("-")
                    if cut_name[-1].strip(" .").isdigit():
                        clean_name = "-".join(cut_name[:-1]).strip()
                    else:
                        clean_name = raw_name

                    # a bare tally leaves no name behind; nothing to record
                    if not clean_name:
                        continue

                    # update vote object with names
                    if mode == "y":
                        vote.yes(clean_name)
                    elif mode == "n":
                        vote.no(clean_name)
                    elif mode == "o":
                        vote.vote("other", clean_name)
Beispiel #10
0
 def pdf_to_lxml(self):
     """Download the PDF at ``self.url`` and return it parsed as an lxml tree.

     The PDF is fetched with ``self.scraper.urlretrieve``, converted to
     HTML with ``convert_pdf`` and parsed with ``lxml.html.fromstring``.
     The downloaded file is removed once the conversion has finished,
     whether or not it succeeded.
     """
     # function-scope import keeps this block self-contained
     import os

     filename, resp = self.scraper.urlretrieve(self.url)
     try:
         text = convert_pdf(filename, "html")
     finally:
         # the download is only needed for the conversion; don't leak it
         os.remove(filename)
     return lxml.html.fromstring(text)
Beispiel #11
0
 def pdf_to_lxml(self, filename, type='html'):
     """Convert the PDF at ``filename`` and parse the result with lxml.html.

     ``type`` is handed to ``convert_pdf`` unchanged and defaults to
     ``'html'``.
     """
     return lxml.html.fromstring(convert_pdf(filename, type))
Beispiel #12
0
    def scrape_upper_committee(self, url):
        """Yield an ``Organization`` for each committee listed in the PDF at ``url``.

        The PDF is downloaded, converted to text and removed, then read line
        by line.  A line starting with ``Comisi`` opens a new committee name
        (which may continue over following lines); the first line starting
        with a president or ``Miembr`` title creates the committee, and that
        line and each later one is parsed as ``<title> Hon. <name>`` and
        added as a member.  Committees whose name contains ``Conjunta`` are
        not yielded.

        :param url: location of the committee roster PDF; recorded as the
            source of every committee.
        :raises AssertionError: if a member line lacks ``Hon`` or carries an
            unrecognised title.
        """
        filename, resp = self.urlretrieve(url)
        # Convert and clean up immediately so the download is removed even if
        # the caller abandons the generator or parsing raises.  Decoding the
        # whole document once keeps everything below in str; splitting the
        # undecoded bytes on a str separator raises TypeError on Python 3.
        try:
            text = convert_pdf(filename, 'text').decode('utf8')
        finally:
            os.remove(filename)

        comm = None
        comm_name = ''
        title = ''
        MINIMUM_NAME_LENGTH = len('Hon _ _')

        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line.startswith(('Comisi', 'COMISIONES', 'SECRETAR')):

                if comm:
                    # Joint committee rosters are not complete, unfortunately
                    if "Conjunta" not in comm_name:
                        yield comm
                    comm = None
                    comm_name = ''

                if not line.startswith(('COMISIONES', 'SECRETAR')):
                    comm_name = line

                    # Remove "Committee" from committee names
                    comm_name = (
                        comm_name.
                        replace(u"Comisión de ", "").
                        replace(u"Comisión Especial para el Estudio de ", "").
                        replace(u"Comisión Especial para ", "")
                    )
                    comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
                    # slicing instead of indexing tolerates an empty name
                    comm_name = comm_name[:1].upper() + comm_name[1:]

            # Committee president is always listed right after committee name,
            # so anything before that is a continuation of the name
            elif (not comm and
                    comm_name and
                    not re.search(r'^(?:Co.)?President', line) and
                    not line.startswith('Miembr')):
                comm_name = comm_name + " " + line

            # Only a title line long enough to also hold a name starts the
            # committee.
            elif (not comm and
                    (re.search(r'^(?:Co.)?President', line) or
                     line.startswith('Miembr')) and
                    len(line) > len('Presidente ') + MINIMUM_NAME_LENGTH):
                comm = Organization(comm_name, chamber='upper',
                                    classification='committee')
                comm.add_source(url)

            if comm:
                assert re.search(r'(?u)Hon\.?\s\w', line)
                # split on the first "Hon" only, so a second occurrence
                # inside the name does not break the unpacking
                (temp_title, name) = line.split("Hon", 1)
                name = name.strip(". ")

                # a blank title means "same title as the previous member"
                if temp_title.strip():
                    title = temp_title

                    # Translate titles to English for parity with other states
                    if "President" in title:
                        title = 'chairman'
                    elif title.startswith("Vicepresident"):
                        title = 'vicechairman'
                    elif title.startswith("Secretari"):
                        title = 'secretary'
                    elif "Miembr" in title:
                        title = 'member'
                    else:
                        raise AssertionError("Unknown member type: {}".
                                             format(title))

                # Many of the ex-officio members have appended titles
                if ", " in name:
                    name = name.split(", ")[0]

                if name.lower() != 'vacante':
                    comm.add_member(name, title)

        if comm and "Conjunta" not in comm_name:
            yield comm
Beispiel #13
0
    def scrape_chamber(self, chamber, session):
        """Yield a ``Vote`` for every roll call found in a chamber's journals.

        The journal index page for ``session`` is fetched, every linked PDF
        is converted to text, and the text is walked line by line with a
        small state machine:

        * idle until a ``ROLL CALL`` line is seen,
        * then collect motion text until a line ending in ``VOTING``,
        * then collect ``LABEL: name; name; ...`` roll-call lines until a
          line containing an outcome word (passed, adopted, ...).

        One ``Vote`` is yielded per bill id mentioned in the closing text.

        :param chamber: ``'lower'`` selects the house journal; any other
            value selects the senate journal.  Also used as the vote chamber.
        :param session: key into the session slug table below; an unknown
            session raises ``KeyError``.
        :raises AssertionError: if the parser ends up capturing motion text
            and votes at once, or roll-call labels leak into the motion text.
        """
        chamber_name = 'house' if chamber == 'lower' else 'senate'
        session_slug = {
            '62': '62-2011',
            '63': '63-2013',
            '64': '64-2015',
            '65': '65-2017',
        }[session]

        # Invariants used by the per-line loop, defined once up front.
        bill_re = r"(?i)(H|S|J)(C?)(B|R|M) (\d+)"
        outcome_words = ('passed', 'adopted', 'sustained', 'prevailed',
                         'lost', 'failed')
        # roll-call label in the journal -> vote option
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other"
        }
        # first letter of a bill id -> chamber of origin
        chambers = {
            "H": "lower",
            "S": "upper",
            "J": "legislature"
        }
        VETO_SUPERMAJORITY = 2 / 3

        # Open the index page of the session's Registers, and open each
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug, chamber_name)
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []
            name_may_be_continued = False

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib['href']

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                continue

            # Convert the PDF to text; drop the download even on failure
            try:
                data = convert_pdf(path, type='text').decode('utf-8')
            finally:
                os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            lines = data.splitlines()
            for line in lines:
                # Ignore lines with no information
                if re.search(chamber_re, line) or \
                        re.search(date_re, line) or \
                        re.search(page_re, line) or \
                        line.strip() == "":
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                        "Scraper should not be simultaneously processing " +
                        "motion name and votes, as it is for this motion: " +
                        cur_motion)

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith("VOTING") or \
                            line.strip().endswith("VOTING."):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    has_outcome = any(x in line.lower() for x in outcome_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" \
                            in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [
                            x.strip() for x in who.split(';')
                            if x.strip() != ""
                        ]
                        results[cur_vote] = who

                        # no trailing ';' means the last name may wrap onto
                        # the next line
                        name_may_be_continued = not line.endswith(";")

                    # Extracts bill numbers in the closing text
                    # used for when the closing text is multiple lines.
                    elif cur_vote is not None and \
                            re.findall(bill_re, line) and \
                            not has_outcome:
                        bills.extend(re.findall(bill_re, line))

                    # A further line of names for the current label
                    elif cur_vote is not None and not has_outcome:
                        who = [
                            x.strip() for x in line.split(";")
                            if x.strip() != ""
                        ]

                        # Glue a wrapped name back together; the emptiness
                        # checks avoid an IndexError when either side has
                        # nothing to join.
                        if name_may_be_continued and who and results[cur_vote]:
                            results[cur_vote][-1] = results[cur_vote][-1] + \
                                " " + who.pop(0)

                        name_may_be_continued = not line.endswith(";")

                        results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif has_outcome:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(re.findall(bill_re, line))
                        if bills == [] or cur_motion.strip() == "":
                            # Log first, then reset: the message includes the
                            # motion text, which is cleared right after.
                            self.warning("No motion or bill name found: " +
                                         "motion name: " + cur_motion + "; " +
                                         "decision text: " + line.strip())
                            results = {}
                            cur_motion = ""
                            bills = []
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                "Vote data found in motion name: " +
                                cur_motion)

                        # Use the collected results to determine who voted how
                        res = {}
                        for key in keys:
                            if key in results:
                                res[keys[key]] = results[key]
                            else:
                                res[keys[key]] = []

                        # Count the number of members voting each way
                        yes, no, other = \
                            len(res['yes']), \
                            len(res['no']), \
                            len(res['other'])

                        # Almost all of the time, a vote only applies to one bill and this loop
                        # will only be run once.
                        # Some exceptions exist.

                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            bc = chambers.get(cur_bill_id[0], 'other')

                            # Determine whether or not the vote passed
                            if "over the governor's veto" in cur_motion.lower():
                                # no yes/no votes at all cannot reach the
                                # supermajority (and must not divide by zero)
                                decided = yes + no
                                passed = (decided > 0 and
                                          yes / decided > VETO_SUPERMAJORITY)
                            else:
                                passed = (yes > no)
                            # Create a Vote object based on the scraped information
                            vote = Vote(
                                chamber=chamber,
                                start_date=cur_date.strftime('%Y-%m-%d'),
                                motion_text=cur_motion,
                                result='pass' if passed else 'fail',
                                legislative_session=session,
                                classification='passage',
                                bill=cur_bill_id,
                                bill_chamber=bc)

                            vote.add_source(pdf_url)
                            vote.add_source(url)
                            vote.set_count('yes', yes)
                            vote.set_count('no', no)
                            vote.set_count('other', other)
                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                for voter in res[key]:
                                    vote.vote(key, voter)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name in keys:
                                # Need to search for the singular, not plural, in the text
                                # so it can find, for example,  " 1 NAY "
                                vote_re = r"(\d+)\s{}".format(
                                    category_name[:-1])
                                motion_count = int(
                                    re.findall(vote_re, cur_motion)[0])

                                for item in vote.counts:
                                    if item['option'] == keys[category_name]:
                                        vote_count = item['value']

                                if motion_count != vote_count:
                                    self.warning(
                                        "Motion text vote counts ({}) ".format(
                                            motion_count) +
                                        "differed from roll call counts ({}) ".
                                        format(vote_count) +
                                        "for {0} on {1}".format(
                                            category_name, cur_bill_id))

                                    # NOTE(review): this only rebinds the local
                                    # name and leaves vote.counts untouched; it
                                    # looks like it was meant to overwrite
                                    # item['value'] - confirm intent before
                                    # changing the emitted counts.
                                    for item in vote.counts:
                                        if item['option'] == keys[
                                                category_name]:
                                            vote_count = motion_count

                            yield vote

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
Beispiel #14
0
 def pdf_to_lxml(self, filename, type='html'):
     """Convert the PDF at ``filename`` and parse the result with lxml.html.

     ``type`` is handed to ``convert_pdf`` unchanged and defaults to
     ``'html'``.
     """
     return lxml.html.fromstring(convert_pdf(filename, type))
Beispiel #15
0
    def scrape_vote(self, url, session):
        """Build a ``VoteEvent`` from the roll-call PDF at ``url``.

        The PDF text is read in two phases.  The preamble supplies the bill
        id, the motion (the last non-blank line before the counts, with a
        line ending in ``?`` appended to the line before it) and the
        ``Yeas / Nays / Not Voting / Excused / Absent`` totals.  The votes
        section then lists names under ``Voting Yea``, ``Voting Nay``,
        ``Not Voting``, ``Excused from Voting`` and ``Excused (Absent)``.

        :param url: location of the vote PDF; "senate" in the URL selects the
            upper chamber.  Also used as source and ``pupa_id``.
        :param session: stored as the event's ``legislative_session``.
        :returns: the populated ``VoteEvent``, or ``None`` when the document
            is empty or lacks a bill id, a motion or the count line.
        """
        # function-scope import keeps this block self-contained
        import os

        fname, _ = self.urlretrieve(url)
        # the download is only needed for the conversion; don't leak it
        try:
            text = convert_pdf(fname, type="text").decode()
        finally:
            os.remove(fname)
        lines = text.splitlines()

        chamber = "upper" if "senate" in url else "lower"
        if "Maryland" not in text:
            self.warning(f"empty vote from {url}")
            return
        date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

        section = "preamble"
        motion = None
        bill_id = None
        how = None
        voters = defaultdict(list)

        for line in lines:
            if section == "preamble":
                possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
                if possible_bill_id:
                    bill_id = possible_bill_id[0]

                # preamble has metadata, then motion, then counts.  our process then is to
                # store the last line as the motion, but if the last line looks like a
                # continuation, append it to the prior line

                line = line.strip()
                counts = re.findall(
                    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                    line,
                )
                if counts:
                    yes_count, no_count, nv_count, excused_count, absent_count = (
                        int(count) for count in counts[0]
                    )
                    section = "votes"
                elif line and line != "(Const)":
                    # questions seem to be split across two lines; only
                    # append when there is a previous line to append to
                    if line.endswith("?") and motion:
                        motion = motion + " " + line
                    else:
                        motion = line
            elif section == "votes":
                if line.startswith("Voting Yea"):
                    how = "yes"
                elif line.startswith("Voting Nay"):
                    how = "no"
                elif line.startswith("Not Voting"):
                    how = "not voting"
                elif line.startswith("Excused from Voting"):
                    how = "excused"
                elif line.startswith("Excused (Absent)"):
                    how = "absent"
                elif how:
                    names = re.split(r"\s{2,}", line)
                    voters[how].extend(names)

        if not bill_id and not motion:
            return
        elif bill_id and not motion:
            self.warning(f"got {bill_id} but no motion, not registering as a vote")
            return
        elif motion and not bill_id:
            self.warning(f"got {motion} but no bill_id, not registering as a vote")
            return

        # Without the count line none of the *_count names were ever bound.
        if section != "votes":
            self.warning(f"no vote counts found in {url}, not registering as a vote")
            return

        # bleh - result not indicated anywhere
        result = "pass" if yes_count > no_count else "fail"
        bill_chamber = "upper" if bill_id.startswith("S") else "lower"
        date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            result=result,
            classification="passage",
            motion_text=motion,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )
        # URL includes sequence ID, will be unique
        vote.pupa_id = url
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)
        for how, names in voters.items():
            for name in names:
                name = name.strip().replace("*", "")
                # skip blanks and any cell containing these marker strings
                if name and "COPY" not in name and "Indicates Vote Change" not in name:
                    vote.vote(how, name)
        check_counts(vote, raise_error=True)
        return vote
 def pdf_to_lxml(self):
     """Download the PDF at ``self.url`` and return it parsed as an lxml tree.

     The PDF is fetched with ``self.scraper.urlretrieve``, converted to
     HTML with ``convert_pdf`` and parsed with ``lxml.html.fromstring``.
     The downloaded file is removed once the conversion has finished,
     whether or not it succeeded.
     """
     # function-scope import keeps this block self-contained
     import os

     filename, resp = self.scraper.urlretrieve(self.url)
     try:
         text = convert_pdf(filename, 'html')
     finally:
         # the download is only needed for the conversion; don't leak it
         os.remove(filename)
     return lxml.html.fromstring(text)
Beispiel #17
0
    def scrape_chamber(self, chamber, session):
        """Yield a vote for every roll call found in a chamber's journal PDFs.

        The journal index page for the session is fetched, every linked PDF
        is converted to text, and the text is walked line by line with a
        small state machine:

        * a ``ROLL CALL`` line starts the capture of the motion text;
        * a line ending in ``VOTING`` / ``VOTING.`` ends the motion text and
          starts the capture of the voter lists (``LABEL: name; name; ...``);
        * a line containing an outcome word (passed, adopted, ...) closes the
          vote, and one vote is yielded per bill mentioned in the closing
          text.

        Args:
            chamber: ``'lower'`` selects the house journal; any other value
                selects the senate journal.
            session: key into the session-slug table below.

        Yields:
            One ``Vote`` per (roll call, bill) pair.

        Raises:
            KeyError: if ``session`` is not in the slug table.
            AssertionError: if the parser is capturing motion text and votes
                at the same time, or if vote data leaked into a motion name.
        """
        # Bill / resolution identifiers such as "HB 1001"; every match is a
        # 4-tuple (chamber letter, optional "C", type letter, number).
        bill_re = r"(?i)(H|S|J)(C?)(B|R|M) (\d+)"
        # Words that mark the line announcing the outcome of a vote.
        outcome_words = ('passed', 'adopted', 'sustained', 'prevailed',
                         'lost', 'failed')
        # Journal section labels -> vote options.
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other"
        }
        # First letter of a bill id -> chamber of origin.
        chambers = {
            "H": "lower",
            "S": "upper",
            "J": "legislature"
        }

        chamber_name = 'house' if chamber == 'lower' else 'senate'
        session_slug = {
                '62': '62-2011',
                '63': '63-2013',
                '64': '64-2015',
                '65': '65-2017',
                '66': '66-2019',
                }[session]

        # Open the index page of the session's journals, then process each
        # PDF it links to.
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug, chamber_name)
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []
            # True while the last recorded name may wrap onto the next line
            name_may_be_continued = False

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib['href']

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                continue

            # Convert the PDF to text; the downloaded file is removed even
            # when the conversion fails.
            try:
                data = convert_pdf(path, type='text').decode('utf-8')
            finally:
                os.unlink(path)

            # Determine the date of the document
            dates = re.findall(date_re, data)
            if not dates:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue
            cur_date = datetime.datetime.strptime(dates[0][0], "%A, %B %d, %Y")

            # Check each line of the text for motion and vote information
            for line in data.splitlines():
                # Ignore lines with no information
                if re.search(chamber_re, line) or \
                        re.search(date_re, line) or \
                        re.search(page_re, line) or \
                        line.strip() == "":
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                            "Scraper should not be simultaneously processing " +
                            "motion name and votes, as it is for this motion: " +
                            cur_motion
                            )

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith(("VOTING", "VOTING.")):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    lowered = line.lower()
                    has_outcome = any(word in lowered for word in outcome_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" \
                            in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [x.strip() for x in who.split(';') if x.strip() != ""]
                        results[cur_vote] = who

                        name_may_be_continued = not line.endswith(";")

                    # Extracts bill numbers in the closing text
                    # used for when the closing text is multiple lines.
                    elif cur_vote is not None and \
                            re.findall(bill_re, line) and not has_outcome:
                        bills.extend(re.findall(bill_re, line))

                    # Continuation of the current voter list
                    elif cur_vote is not None and not has_outcome:
                        who = [x.strip() for x in line.split(";") if x.strip() != ""]

                        # Glue a wrapped name back together; skipped when
                        # there is nothing to glue (the label line carried no
                        # names, or this line has none).
                        if name_may_be_continued and results[cur_vote] and who:
                            results[cur_vote][-1] = results[cur_vote][-1] + \
                                    " " + who.pop(0)

                        name_may_be_continued = not line.endswith(";")

                        results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif has_outcome:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(re.findall(bill_re, line))
                        if bills == [] or cur_motion.strip() == "":
                            # Log before wiping the state so the warning
                            # actually shows the motion text.
                            self.warning(
                                    "No motion or bill name found: " +
                                    "motion name: " + cur_motion + "; " +
                                    "decision text: " + line.strip()
                                    )
                            results = {}
                            cur_motion = ""
                            bills = []
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                    "Vote data found in motion name: " +
                                    cur_motion
                                    )

                        # Use the collected results to determine who voted how
                        res = {}
                        for key in keys:
                            res[keys[key]] = results[key] if key in results else []

                        # Count the number of members voting each way
                        yes = len(res['yes'])
                        no = len(res['no'])
                        other = len(res['other'])

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            VETO_SUPERMAJORITY = 2 / 3
                            votes_cast = yes + no
                            # No yes/no votes at all cannot override a veto
                            # (and must not divide by zero).
                            passed = (votes_cast > 0 and
                                      yes / votes_cast > VETO_SUPERMAJORITY)
                        else:
                            passed = (yes > no)

                        # Almost all of the time, a vote only applies to one bill and this loop
                        # will only be run once.
                        # Some exceptions exist.

                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            bc = chambers.get(cur_bill_id[0], 'other')

                            # Create a Vote object based on the scraped information
                            vote = Vote(chamber=chamber,
                                        start_date=cur_date.strftime('%Y-%m-%d'),
                                        motion_text=cur_motion,
                                        result='pass' if passed else 'fail',
                                        legislative_session=session,
                                        classification='passage',
                                        bill=cur_bill_id,
                                        bill_chamber=bc)

                            vote.add_source(pdf_url)
                            vote.add_source(url)
                            vote.set_count('yes', yes)
                            vote.set_count('no', no)
                            vote.set_count('other', other)
                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                for voter in res[key]:
                                    vote.vote(key, voter)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name in keys:
                                # Need to search for the singular, not plural, in the text
                                # so it can find, for example,  " 1 NAY "
                                vote_re = r"(\d+)\s{}".format(category_name[:-1])
                                motion_count = int(re.findall(vote_re, cur_motion)[0])

                                for item in vote.counts:
                                    if item['option'] == keys[category_name]:
                                        vote_count = item['value']

                                # A mismatch is only reported; the recorded
                                # counts stay as tallied from the roll call.
                                if motion_count != vote_count:
                                    self.warning(
                                            "Motion text vote counts ({}) ".format(motion_count) +
                                            "differed from roll call counts ({}) ".format(
                                                vote_count) +
                                            "for {0} on {1}".format(category_name, cur_bill_id)
                                            )

                            yield vote

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
Beispiel #18
0
    def scrape_house_vote(self, bill, url):
        """Yield the House roll-call vote recorded in the PDF at ``url``.

        The PDF is converted to text and scanned line by line for:

        * a line ending in an ``m/d/yyyy`` date (the vote date);
        * the indented totals line ``YEAS: n  NAYS: n  NOT VOTING: n``; the
          motion text is taken from two lines above it, and the vote counts
          as passed when that line ends in ``ADOPTED`` or ``PASSED``;
        * section headers (``YEAS: n``, ``NAYS: n``, ...) followed by lines
          of names separated by runs of three spaces.

        Args:
            bill: passed through unchanged as the ``bill`` of the vote event.
            url: location of the vote PDF; also used as the source and as
                the ``pupa_id`` of the vote event.

        Yields:
            At most one ``VoteEvent``. Nothing is yielded (a warning is
            logged instead) when the download fails or when no date or no
            totals line is found.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        try:
            text = convert_pdf(filename, "text")
        finally:
            # The text is in memory now; never leave the download behind.
            os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None
        # ``motion`` stays None until the totals line has been seen.
        motion = None
        passed = False
        yes_count = no_count = other_count = 0

        section_options = {
            "YEAS": "yes",
            "NAYS": "no",
            "NOT VOTING": "other",
            "EXCUSED": "other",
            "PAIRED": "paired",
        }

        for idx, line in enumerate(lines):
            line = line.rstrip().decode("utf-8")
            match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
            if match:
                # The motion sits two lines above the totals; a negative
                # index would silently wrap to the end of the document.
                motion = lines[idx - 2].strip().decode("utf-8") if idx >= 2 else ""
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                exc_match = re.search(r"EXCUSED: (\d+)", line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(("ADOPTED", "PASSED"))
                continue

            match = re.match(
                r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
            if match:
                vote_type = section_options[match.group(1)]
                continue

            if vote_type == "paired":
                for part in line.split("   "):
                    part = part.strip()
                    if not part:
                        continue
                    # Match each column on its own; matching the whole line
                    # would record the first pair once per column.
                    pair_match = re.match(r"([^\(]+)\((YEA|NAY)\)", part)
                    if not pair_match:
                        # Not a "Name (YEA|NAY)" entry; nothing to record.
                        continue
                    name, pair_type = pair_match.groups()
                    name = name.strip()
                    if pair_type == "YEA":
                        votes["yes"].append(name)
                    elif pair_type == "NAY":
                        votes["no"].append(name)
            elif vote_type:
                for name in line.split("   "):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if date is None:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
            return
        if motion is None:
            self.warning("no vote totals found in %s" % url)
            return

        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
Beispiel #19
0
    def scrape_house_vote(self, bill, url):
        """Yield the House roll-call vote recorded in the PDF at ``url``.

        The PDF is converted to text and scanned line by line for:

        * a line ending in an ``m/d/yyyy`` date (the vote date);
        * the indented totals line ``YEAS: n  NAYS: n  NOT VOTING: n``; the
          motion text is taken from two lines above it, and the vote counts
          as passed when that line ends in ``ADOPTED`` or ``PASSED``;
        * section headers (``YEAS: n``, ``NAYS: n``, ...) followed by lines
          of names separated by runs of three spaces.

        Args:
            bill: passed through unchanged as the ``bill`` of the vote.
            url: location of the vote PDF; also added as the vote's source.

        Yields:
            At most one ``Vote``. Nothing is yielded (a warning is logged
            instead) when the download fails or when no date or no totals
            line is found.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        try:
            text = convert_pdf(filename, 'text')
        finally:
            # The text is in memory now; never leave the download behind.
            os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None
        # ``motion`` stays None until the totals line has been seen.
        motion = None
        passed = False
        yes_count = no_count = other_count = 0

        section_options = {'YEAS': 'yes',
                           'NAYS': 'no',
                           'NOT VOTING': 'other',
                           'EXCUSED': 'other',
                           'PAIRED': 'paired'}

        for idx, line in enumerate(lines):
            line = line.rstrip().decode('utf-8')
            match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)',
                line)
            if match:
                # The motion sits two lines above the totals; a negative
                # index would silently wrap to the end of the document.
                motion = lines[idx - 2].strip().decode('utf-8') if idx >= 2 else ''
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()]

                exc_match = re.search(r'EXCUSED: (\d+)', line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(('ADOPTED', 'PASSED'))
                continue

            match = re.match(
                r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$',
                line)
            if match:
                vote_type = section_options[match.group(1)]
                continue

            if vote_type == 'paired':
                for part in line.split('   '):
                    part = part.strip()
                    if not part:
                        continue
                    # Match each column on its own; matching the whole line
                    # would record the first pair once per column.
                    pair_match = re.match(
                        r'([^\(]+)\((YEA|NAY)\)', part)
                    if not pair_match:
                        # Not a "Name (YEA|NAY)" entry; nothing to record.
                        continue
                    name, pair_type = pair_match.groups()
                    name = name.strip()
                    if pair_type == 'YEA':
                        votes['yes'].append(name)
                    elif pair_type == 'NAY':
                        votes['no'].append(name)
            elif vote_type:
                for name in line.split('   '):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if date is None:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
            return
        if motion is None:
            self.warning("no vote totals found in %s" % url)
            return

        vote = Vote(chamber='lower',
                    start_date=date.strftime("%Y-%m-%d"),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    classification='passage',
                    bill=bill)

        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
Beispiel #20
0
    def scrape_house_vote(self, bill, url):
        """Yield the House roll-call vote recorded in the PDF at ``url``.

        The PDF is converted to text and scanned line by line for:

        * a line ending in an ``m/d/yyyy`` date (the vote date);
        * the indented totals line ``YEAS: n  NAYS: n  NOT VOTING: n``; the
          motion text is taken from two lines above it, and the vote counts
          as passed when that line ends in ``ADOPTED`` or ``PASSED``;
        * section headers (``YEAS: n``, ``NAYS: n``, ...) followed by lines
          of names separated by runs of three spaces.

        Args:
            bill: passed through unchanged as the ``bill`` of the vote event.
            url: location of the vote PDF; also used as the source and as
                the ``pupa_id`` of the vote event.

        Yields:
            At most one ``VoteEvent``. Nothing is yielded (a warning is
            logged instead) when the download fails or when no date or no
            totals line is found.
        """
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        try:
            text = convert_pdf(filename, 'text')
        finally:
            # The text is in memory now; never leave the download behind.
            os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None
        # ``motion`` stays None until the totals line has been seen.
        motion = None
        passed = False
        yes_count = no_count = other_count = 0

        section_options = {
            'YEAS': 'yes',
            'NAYS': 'no',
            'NOT VOTING': 'other',
            'EXCUSED': 'other',
            'PAIRED': 'paired'
        }

        for idx, line in enumerate(lines):
            line = line.rstrip().decode('utf-8')
            match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
            if match:
                # The motion sits two lines above the totals; a negative
                # index would silently wrap to the end of the document.
                motion = lines[idx - 2].strip().decode('utf-8') if idx >= 2 else ''
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                exc_match = re.search(r'EXCUSED: (\d+)', line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(('ADOPTED', 'PASSED'))
                continue

            match = re.match(
                r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
            if match:
                vote_type = section_options[match.group(1)]
                continue

            if vote_type == 'paired':
                for part in line.split('   '):
                    part = part.strip()
                    if not part:
                        continue
                    # Match each column on its own; matching the whole line
                    # would record the first pair once per column.
                    pair_match = re.match(r'([^\(]+)\((YEA|NAY)\)', part)
                    if not pair_match:
                        # Not a "Name (YEA|NAY)" entry; nothing to record.
                        continue
                    name, pair_type = pair_match.groups()
                    name = name.strip()
                    if pair_type == 'YEA':
                        votes['yes'].append(name)
                    elif pair_type == 'NAY':
                        votes['no'].append(name)
            elif vote_type:
                for name in line.split('   '):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if date is None:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
            return
        if motion is None:
            self.warning("no vote totals found in %s" % url)
            return

        vote = VoteEvent(chamber='lower',
                         start_date=date.strftime("%Y-%m-%d"),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passage',
                         bill=bill)

        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
Beispiel #21
0
    def scrape_upper_committee(self, url):
        """Yield upper-chamber committees parsed from the roster PDF at ``url``.

        The PDF is converted to text and read line by line:

        * a line starting with ``Comisi`` opens a new committee and supplies
          its name (``COMISIONES`` / ``SECRETAR`` headings only close the
          previous committee);
        * lines up to the first ``President...`` / ``Miembr...`` line are
          treated as a continuation of the committee name;
        * every following line must hold ``Hon. <name>``, optionally
          preceded by a title, which is translated to an English role and
          carried over to later lines that have no title of their own.

        Committees whose name contains ``Conjunta`` are skipped, and members
        named ``vacante`` are not added.

        Args:
            url: location of the roster PDF; also added as each committee's
                source.

        Yields:
            One ``Organization`` per committee found.

        Raises:
            AssertionError: if a member line lacks ``Hon`` or carries an
                unknown title.
        """
        filename, resp = self.urlretrieve(url)
        try:
            raw_text = convert_pdf(filename, 'text')
        finally:
            # Remove the download right away: cleaning up only after the last
            # ``yield`` leaks the file whenever the caller stops iterating
            # early or a line fails to parse.
            os.remove(filename)

        # Decode once at the boundary, then work on text only. Splitting
        # bytes with a str separator (or decoding str lines) fails on
        # Python 3.
        if isinstance(raw_text, bytes):
            raw_text = raw_text.decode('utf8')

        comm = None
        comm_name = ''
        title = ''
        MINIMUM_NAME_LENGTH = len('Hon _ _')

        for line in raw_text.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line.startswith(('Comisi', 'COMISIONES', 'SECRETAR')):

                if comm:
                    # Joint committee rosters are not complete, unfortunately
                    if "Conjunta" not in comm_name:
                        yield comm
                    comm = None
                    comm_name = ''

                if not line.startswith(('COMISIONES', 'SECRETAR')):
                    comm_name = line

                    # Remove "Committee" from committee names
                    comm_name = (comm_name.replace(
                        u"Comisión de ",
                        "").replace(u"Comisión Especial para el Estudio de ",
                                    "").replace(u"Comisión Especial para ",
                                                ""))
                    comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
                    # Slicing keeps this safe even for an empty name
                    comm_name = comm_name[:1].upper() + comm_name[1:]

            # Committee president is always listed right after committee name
            elif (not comm and comm_name
                  and not re.search(r'^(?:Co.)?President', line)
                  and not line.startswith('Miembr')):
                comm_name = comm_name + " " + line

            elif (not comm and (re.search(r'^(?:Co.)?President', line)
                                or line.startswith('Miembr'))
                  and len(line) > len('Presidente ') + MINIMUM_NAME_LENGTH):
                comm = Organization(comm_name,
                                    chamber='upper',
                                    classification='committee')
                comm.add_source(url)

            if comm:
                # Explicit raise instead of ``assert`` so the check is not
                # stripped when Python runs with -O.
                if not re.search(r'(?u)Hon\.?\s\w', line):
                    raise AssertionError(
                        "Expected a 'Hon.' member line, got: %r" % line)
                # Split once only: a second "Hon" inside the name must not
                # break the two-value unpacking.
                (temp_title, name) = line.split("Hon", 1)
                name = name.strip(". ")

                if temp_title.strip():
                    title = temp_title

                    # Translate titles to English for parity with other states
                    if "President" in title:
                        title = 'chairman'
                    elif title.startswith("Vicepresident"):
                        title = 'vicechairman'
                    elif title.startswith("Secretari"):
                        title = 'secretary'
                    elif "Miembr" in title:
                        title = 'member'
                    else:
                        raise AssertionError(
                            "Unknown member type: {}".format(title))

                # Many of the ex-officio members have appended titles
                if ", " in name:
                    name = name.split(", ")[0]

                if name.lower() != 'vacante':
                    comm.add_member(name, title)

        if comm and "Conjunta" not in comm_name:
            yield comm