def test_vote_event_bill_actions_two_stage():
    """A bill action stays pinned to the first vote event across two imports.

    Closely related to the ve3/ve4 case in test_vote_event_bill_actions: two
    vote events referencing the same action must not conflict on the
    OneToOneField.  Here the votes arrive in two import stages, so the conflict
    has to be handled even when they were not part of the same scrape.
    """
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    # the two vote events are field-for-field identical ...
    shared_fields = dict(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    ve1 = ScrapeVoteEvent(**shared_fields)
    ve2 = ScrapeVoteEvent(**shared_fields)
    # ... and are told apart only by their dedupe keys
    ve1.dedupe_key = "one"
    ve2.dedupe_key = "two"

    bill_importer = BillImporter("jid")
    bill_importer.import_data([bill.as_dict()])

    # stage one: the first vote event imports cleanly and gets the action
    VoteEventImporter("jid", bill_importer).import_data([ve1.as_dict()])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 1
    assert stored[0].bill_action is not None

    # stage two: once the second one shows up, the action must remain attached
    # to the first, exactly as if both had been part of a single import
    VoteEventImporter("jid", bill_importer).import_data(
        [ve1.as_dict(), ve2.as_dict()]
    )
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 2
    first, second = stored
    assert first.bill_action is not None
    assert second.bill_action is None
Ejemplo n.º 2
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Build a VoteEvent for the roll call described by ``vote_record``.

    ``vote_record`` is indexed by "yes", "no", "excused", "absent" and "other"
    (each is measured with ``len`` and iterated for voters) plus "date", which
    is formatted with ``strftime``.  The result is "pass" only when there are
    strictly more yes entries than no entries.  ``url`` serves as both the
    dedupe key and the source of the returned vote event.
    """
    options = ("yes", "no", "excused", "absent", "other")

    # A vote on a substitute is marked as XHB; map it back to the HB identifier
    bill_id = bill_id.replace("XHB", "HB")

    if len(vote_record["yes"]) > len(vote_record["no"]):
        result = "pass"
    else:
        result = "fail"
    start_date = vote_record["date"].strftime("%Y-%m-%d")
    # the first letter of the bill id decides the bill's chamber
    bill_chamber = "upper" if bill_id[0] == "S" else "lower"

    vote_event = VoteEvent(
        result=result,
        chamber=chamber,
        start_date=start_date,
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    vote_event.dedupe_key = url

    # totals first, then the individual voters, both in the same option order
    for option in options:
        vote_event.set_count(option, len(vote_record[option]))
    for option in options:
        for voter in vote_record[option]:
            vote_event.vote(option, voter)

    vote_event.add_source(url)
    return vote_event
Ejemplo n.º 3
0
    def parse_vote_page(self, vote_url, bill):
        """Fetch the roll-call page at ``vote_url`` and build a VoteEvent for ``bill``.

        The vote passes when the yea total is strictly greater than the nay
        total; the three remaining totals on the page are summed into "other".
        """

        def trailing_int(text):
            # each totals cell ends with its number
            return int(text.split()[-1])

        page = lxml.html.fromstring(self.get(vote_url).text)

        # the chamber is read off the URL
        chamber = "upper" if "senate" in vote_url else "lower"

        # the date starts at offset 18 of the cell text, formatted like Mar 23, 2009
        date_text = page.xpath('//td[starts-with(text(), "Legislative")]')[0].text
        date_text = date_text.replace("\xa0", " ")
        vote_date = datetime.datetime.strptime(date_text[18:], "%b %d, %Y")

        # the motion is the joined text of every 23-column-wide cell
        motion_cells = page.xpath('//td[@colspan="23"]')
        motion = "".join(cell.text_content() for cell in motion_cells)
        if not motion:
            motion = "No motion given"  # XXX: Double check this. See SJ 3.
        motion = motion.replace("\xa0", " ")

        # totals live in cells sharing the CSS class of the "Yeas" cell; the
        # first text node with that class is dropped
        totals_class = page.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
        totals = page.xpath('//td[@class="%s"]/text()' % totals_class)[1:]
        yes_count = trailing_int(totals[0])
        no_count = trailing_int(totals[1])
        other_count = (
            trailing_int(totals[2]) + trailing_int(totals[3]) + trailing_int(totals[4])
        )

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=vote_date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if yes_count > no_count else "fail",
        )
        vote.dedupe_key = vote_url  # contains sequence number
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)

        # A section heading selects which VoteEvent method records the cells
        # that follow it; the method is only looked up once its heading is seen.
        section_methods = (
            ("Voting Yea", "yes"),
            ("Voting Nay", "no"),
            ("Not Voting", "other"),
            ("Excused", "other"),
        )
        record = None
        for cell_text in page.xpath("//td/text()"):
            cell_text = cell_text.replace("\xa0", " ")
            for heading, method_name in section_methods:
                if cell_text.startswith(heading):
                    record = getattr(vote, method_name)
                    break
            else:
                # not a heading: a voter cell, provided a section has started
                if record:
                    record(cell_text.rstrip("*"))

        return vote
Ejemplo n.º 4
0
    def add_vote(self, bill, chamber, date, text, url):
        """Build a VoteEvent from an action's text and attach its roll call.

        :param bill: bill the vote is linked to
        :param chamber: chamber passed through to the VoteEvent
        :param date: datetime handed to ``TIMEZONE.localize`` for the start date
        :param text: action text; it must contain the "Ayes N; Noes N" style
            totals and is also used verbatim as the motion text
        :param url: roll-call URL; may be falsy, in which case no dedupe key,
            source or individual votes are attached
        :return: the populated VoteEvent
        :raises IndexError: if ``text`` contains no recognisable totals
        """
        totals = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text)
        yes, no = int(totals[0][0]), int(totals[0][1])

        # the first classifier whose regex matches the text wins; none -> []
        classification = []
        for pattern, motion_type in motion_classifiers.items():
            if re.match(pattern, text):
                classification = motion_type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result="pass" if yes > no else "fail",
            classification=classification,
            bill=bill,
        )
        v.set_count("yes", yes)
        v.set_count("no", no)

        # everything derived from the URL sits behind the same guard, so a
        # missing URL no longer fails on ``url.split``
        if url:
            # the last path segment of the roll-call URL identifies the vote
            v.dedupe_key = url.split("/")[-1]
            v.add_source(url)

            # fetch the vote itself
            if "av" in url:
                self.add_house_votes(v, url)
            elif "sv" in url:
                self.add_senate_votes(v, url)

        return v
Ejemplo n.º 5
0
    def scrape_senate_vote(self, bill, url, date):
        """Download the Senate vote PDF at ``url`` and yield the votes parsed from it.

        Nothing is yielded when the file cannot be retrieved.  Text using the
        "Yea: N Nay: N Absent: N" layout is delegated to
        ``scrape_senate_vote_3col``; otherwise the names are tallied here and a
        single VoteEvent is yielded.
        """
        try:
            filename, _ = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber="upper",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text="Passage",
            # placeholder only; overwritten below once the names are counted
            result="fail",
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        vote.dedupe_key = url

        text = convert_pdf(filename, "text").decode("utf-8")
        os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        # re.split keeps the captured labels; empty pieces are dropped and the
        # list is reversed so that pop() walks it front to back.  The loop
        # below consumes the pieces two at a time: a label, then its names.
        pieces = re.split(r"(Yea|Nay|Absent)s?:", text)
        segments = [piece for piece in reversed(pieces) if piece]

        option_for_label = {"yea": "yes", "nay": "no"}
        names_per_label = collections.defaultdict(int)
        tally = {"yes": 0, "no": 0, "other": 0}
        while segments:
            label = segments.pop()
            option = option_for_label.get(label.lower(), "other")
            names_blob = segments.pop()
            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", names_blob):
                if name.lower().strip() == "none.":
                    continue
                # drop ".." runs, one trailing period, then surrounding
                # digits, dashes, spaces and newlines
                name = re.sub(r"\.$", "", name.replace("..", ""))
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                vote.vote(option, name)
                names_per_label[label] += 1
                tally[option] += 1
            assert names_per_label[label] == tally[option]

        for option, total in tally.items():
            vote.set_count(option, total)

        # replace the placeholder: yes must outnumber no and other combined
        if tally["yes"] > (tally["no"] + tally["other"]):
            vote.result = "pass"
        else:
            vote.result = "fail"

        yield vote
def test_vote_event_dedupe_key_dedupe():
    """Vote events are matched on dedupe_key: same key updates, new key inserts."""
    jurisdiction = create_jurisdiction()
    Organization.objects.create(
        id="org-id",
        name="Legislature",
        classification="legislature",
        jurisdiction=jurisdiction,
    )

    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on something",
        identifier="Roll Call No. 1",
    )
    vote_event.dedupe_key = "foo"

    bill_importer = BillImporter("jid")

    def import_current():
        # import the vote event as it currently stands with a fresh importer
        # and hand back the action the importer reports
        _, action = VoteEventImporter("jid", bill_importer).import_item(
            vote_event.as_dict()
        )
        return action

    # first sighting creates the row
    assert import_current() == "insert"
    assert VoteEvent.objects.count() == 1

    # identical data a second time changes nothing
    assert import_current() == "noop"
    assert VoteEvent.objects.count() == 1

    # a changed result updates the existing row
    vote_event.result = "failed"
    assert import_current() == "update"
    assert VoteEvent.objects.count() == 1

    # a changed identifier still updates, because the dedupe key is unchanged
    vote_event.identifier = "First Roll Call"
    assert import_current() == "update"
    assert VoteEvent.objects.count() == 1

    # a different dedupe key produces a second row
    vote_event.dedupe_key = "bar"
    assert import_current() == "insert"
    assert VoteEvent.objects.count() == 2
Ejemplo n.º 7
0
    def parse_committee_votes(self, bill, url):
        """Yield a VoteEvent for every vote summary linked from the page at ``url``.

        ``url`` is also added to ``bill`` as a source.
        """
        bill.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        if "Senate" in doc.xpath("string(//h1)"):
            chamber = "upper"
        else:
            chamber = "lower"
        # the committee name is the second-to-last text node of the first <h2>
        heading_parts = list(doc.xpath("//h2")[0].itertext())
        committee = heading_parts[-2].strip()

        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date: try each known format; if none fits, the raw cell text is
            # what ends up in ``date``
            date_text = link.xpath("../../td")[0].text_content()
            date = date_text
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                try:
                    date = datetime.datetime.strptime(date_text, fmt)
                except ValueError:
                    continue
                break

            # Motion: the part of the link text after the last " - "
            motion_tail = link.text_content().split(" - ")[-1].strip()
            motion = "Committee vote (%s): %s" % (committee, motion_tail)

            # Roll call
            vote_url = link.attrib["href"]
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            if rollcall["passed"]:
                result = "pass"
            else:
                result = "fail"
            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification=[],
                result=result,
                bill=bill,
            )
            vote.dedupe_key = vote_url
            for option in ("yes", "no", "other"):
                vote.set_count(option, rollcall[option + "_count"])

            # the per-voter lists are optional in the roll-call data
            for option in ("yes", "no", "other"):
                for name in rollcall.get(option + "_votes", []):
                    vote.vote(option, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Ejemplo n.º 8
0
    def scrape_votes(self, bill, bill_page, chamber):
        """Yield a VoteEvent for each "view_votes" link in the bill's history table."""
        # vote option -> label handed to process_count, in reporting order
        count_labels = (
            ("yes", "Yes:"),
            ("no", "No:"),
            ("excused", "Excused - Not Voting:"),
            ("absent", "Absent - Not Voting:"),
            ("abstain", "Present - Not Voting:"),
        )

        for vote_link in bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        ):
            vote_url = vote_link.attrib["href"]
            # the first two cells of the link's row hold the date and the motion
            date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
            vote_date = datetime.strptime(date_td.text, "%b %d, %Y")
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            # the outcome is inferred from the wording of the motion
            if "Passed" in motion_text or "Advanced" in motion_text:
                result = "pass"
            else:
                result = "fail"
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td'
            )
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(vote_date),
                motion_text=motion_text,
                classification="passage",
                result=result,
            )

            # read every total off the page before recording any of them
            counts = [
                (option, self.process_count(vote_page, label))
                for option, label in count_labels
            ]
            for option, total in counts:
                vote.set_count(option, total)

            # the KeyID query parameter of the vote URL is the dedupe key
            query = urllib.parse.urlparse(vote_url).query
            vote.dedupe_key = urllib.parse.parse_qs(query)["KeyID"][0]
            vote.add_source(vote_url)

            # cells come in (name, vote type) pairs
            for idx in range(0, len(cells), 2):
                name = cells[idx].text
                vote_type = cells[idx + 1].text
                if not (name and vote_type):
                    continue
                option = VOTE_TYPE_MAP.get(vote_type.lower(), "other")
                vote.vote(option, name)
            yield vote
Ejemplo n.º 9
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        """Scrape the roll-call page at ``vote_url`` and yield its VoteEvent.

        :param chamber: chamber of the bill, stored as ``bill_chamber``
        :param session: legislative session of the bill
        :param bill_id: identifier of the bill being voted on
        :param vote_url: roll-call page; also the source and dedupe key
        :return: generator yielding one VoteEvent, or nothing if the page
            cannot be fetched
        """
        try:
            html = self.get(vote_url).text
        except scrapelib.HTTPError:
            # log the skipped page instead of dropping it silently
            self.warning("unable to fetch vote page %s, skipping", vote_url)
            return

        doc = lxml.html.fromstring(html)
        motion = doc.xpath("//p[1]//b[1]/text()")[-1].strip()
        if not motion:
            # blank bold text: fall back to the first <h2> heading
            motion = doc.xpath("//h2[1]/text()")[0].strip()

        # the heading is split on whitespace; tokens 0 and 3 are the totals
        vote_count = (
            doc.xpath("//h3[contains(text(),'YEA and ')]/text()")[0].strip().split()
        )
        yeas = int(vote_count[0])
        nays = int(vote_count[3])

        date = doc.xpath("//b[contains(text(),'Date:')]/../text()")[1].strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        # NOTE(review): the vote's own chamber is fixed to "lower" while
        # bill_chamber comes from the caller -- confirm this is intended
        vote = VoteEvent(
            chamber="lower",
            start_date=date,
            motion_text=motion,
            result="pass" if yeas > nays else "fail",
            classification="passage",
            legislative_session=session,
            bill=bill_id,
            bill_chamber=chamber,
        )
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.add_source(vote_url)
        vote.dedupe_key = vote_url

        # first table has YEAs
        for name in doc.xpath("//table[1]//font/text()"):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath("//table[2]//font/text()"):
            vote.no(name.strip())

        yield vote
Ejemplo n.º 10
0
    def scrape_votes(self, bill, page):
        """Yield a VoteEvent for each dated floor action recorded for ``bill``.

        :param bill: bill the votes are linked to
        :param page: bill status data; ``page["FloorHeaders"]`` and
            ``page["BillId"]`` are read
        :return: generator of VoteEvent objects; actions named "No Action" or
            lacking a report date are skipped
        """
        base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
        for header in page["FloorHeaders"]:
            params = {
                "billStatusId": page["BillId"],
                "billStatusActionId": header["BillStatusActionId"],
                "includeVotes": "true",
            }
            resp = self.get(base_url, timeout=80, params=params)
            actions = json.loads(resp.content.decode("utf-8"))

            for action in actions:
                if action["Action"] == "No Action":
                    continue
                if action["ReportDate"] is None:
                    continue
                # drop fractional seconds before parsing the timestamp
                cleaned_date = action["ReportDate"].split(".")[0]
                action_date = datetime.datetime.strptime(
                    cleaned_date, "%Y-%m-%dT%H:%M:%S"
                )

                # Counts may be missing (None); normalise them once so the
                # pass/fail comparison cannot fail on None > None.
                ayes = action["Ayes"] or 0
                nays = action["Nays"] or 0
                passed = action["UnanimouslyAdopted"] or ayes > nays

                vote = VoteEvent(
                    chamber={"S": "upper", "H": "lower"}[header["LegislativeBody"]],
                    motion_text=action["Action"],
                    classification="passage",
                    result="pass" if passed else "fail",
                    start_date=action_date.strftime("%Y-%m-%d"),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count("yes", ayes)
                vote.set_count("no", nays)
                vote.set_count("other", (action["Present"] or 0))
                vote.set_count("absent", (action["Absent"] or 0))
                vote.set_count("excused", (action["Excused"] or 0))
                vote.set_count("not voting", (action["NotVoting"] or 0))

                # anything other than Y / N is recorded as "other"
                for v in action["Votes"]:
                    vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                    vote.vote(vote_type, v["Legislator"]["FullName"])
                vote.dedupe_key = resp.url + str(action["ReferralNumber"])
                yield vote
Ejemplo n.º 11
0
    def scrape_chamber_votes(self, chamber, session):
        """Yield a VoteEvent for every vote found on the chamber's vote pages.

        Votes whose bill cannot be found in ``self._bill_id_by_type`` are
        logged and skipped.
        """
        chamber_for_code = {"H": "lower", "S": "upper"}
        option_for_code = {"Y": "yes", "N": "no"}

        page_name = {"upper": "SVotes", "lower": "HVotes"}[chamber]
        url = "%s/%s" % (RI_URL_BASE, page_name)
        action = "%s/%s" % (url, "votes.asp")

        for date in self.get_vote_dates(url, session):
            vote_page = self.post_to(action, date)
            for vote_dict in self.parse_vote_page(vote_page, url, session):
                for vote in vote_dict.values():
                    count = vote["count"]
                    # each vote names its own chamber in its metadata
                    vote_chamber = chamber_for_code[vote["meta"]["chamber"]]

                    try:
                        bill_id = self._bill_id_by_type[
                            (vote_chamber, vote["meta"]["bill"])
                        ]
                    except KeyError:
                        self.warning(
                            "no such bill_id %s %s", vote_chamber, vote["meta"]["bill"]
                        )
                        continue

                    if count["passage"]:
                        result = "pass"
                    else:
                        result = "fail"
                    v = VoteEvent(
                        chamber=vote_chamber,
                        start_date=vote["time"].strftime("%Y-%m-%d"),
                        motion_text=vote["meta"]["extra"]["motion"],
                        result=result,
                        classification="passage",
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=vote_chamber,
                    )
                    v.set_count("yes", int(count["YEAS"]))
                    v.set_count("no", int(count["NAYS"]))
                    v.set_count("other", int(count["NOT VOTING"]))
                    v.add_source(vote["source"])
                    v.dedupe_key = vote["source"]

                    # any code other than Y / N is recorded as "other"
                    for cast in vote["votes"]:
                        option = option_for_code.get(cast["vote"], "other")
                        v.vote(option, cast["name"])
                    yield v
Ejemplo n.º 12
0
    def asvote(self):
        """Assemble and return a VoteEvent from this object's accessors."""
        chamber = self.chamber()
        start_date = self.date()
        motion_text = self.motion()
        result = "pass" if self.passed() else "fail"

        vote_event = VoteEvent(
            chamber=chamber,
            start_date=start_date,
            motion_text=motion_text,
            result=result,
            classification="passage",
            bill=self.bill,
        )
        vote_event.dedupe_key = self.url  # URL contains sequence number

        # totals, each read from its accessor right before being recorded
        for option, read_count in (
            ("yes", self.yes_count),
            ("no", self.no_count),
            ("other", self.other_count),
        ):
            vote_event.set_count(option, read_count())

        # individual voters
        for name in self.yes_votes():
            vote_event.yes(name)
        for name in self.no_votes():
            vote_event.no(name)
        for name in self.other_votes():
            vote_event.vote("other", name)

        vote_event.add_source(self.url)
        return vote_event
Ejemplo n.º 13
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Parse the roll-call PDF at ``url`` and yield one VoteEvent for ``bill``.

        The PDF is downloaded, converted to text and deleted.  Names are then
        sorted into yes / no / absent / not voting / other lists according to
        the section markers ("precursors") found in the text.

        :param url: location of the vote PDF; also added as the vote's source
            and used, together with ``bill.identifier``, as its dedupe key
        :param motion: key into ``self._vote_mapping``, which supplies the
            motion text and whether the vote passed
        :param date: datetime passed to ``self._tz.localize`` for the start date
        :param chamber: chamber passed through to the VoteEvent
        :param bill: bill the vote is linked to
        :return: generator yielding a single VoteEvent, or nothing when the
            PDF cannot be retrieved
        :raises KeyError: if ``motion`` is not in ``self._vote_mapping``
        """
        try:
            vote_pdf, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("Can't find vote file {}, skipping".format(url))
            return

        text = convert_pdf(vote_pdf, "text")
        os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # (marker, destination list) pairs; a None destination stops collection
        precursors = (
            ("yeas--", yes_votes),
            ("nays--", no_votes),
            ("absent or those not voting--", absent_votes),
            ("absent and those not voting--", absent_votes),
            ("not voting--", not_voting_votes),
            ("voting present--", other_votes),
            ("present--", other_votes),
            ("disclaimer", None),
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split("\n"))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            # NOTE(review): there is no break, so when several markers occur in
            # one line the last matching entry in ``precursors`` wins, and
            # e.g. "not voting--" is a substring of the two "absent ..."
            # markers -- confirm this is intended.
            # NOTE(review): the match is case-insensitive, but ``replace``
            # removes only the lower-case marker from the original-case line.
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    line = line.replace(pc, "")

            # split names
            for name in line.split(","):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if "None." in name:
                    cur_array = None

                # "<name>. Total--..." carries the last name of the section
                # NOTE(review): cur_array may be None here (no section active),
                # in which case the append below raises AttributeError.
                match = re.match(r"(.+?)\. Total--.*", name)
                if match:
                    cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = False
                for junk in (
                        "on final passage",
                        "Necessary",
                        "who would have",
                        "being a tie",
                        "therefore",
                        "Vacancies",
                        "a pair",
                        "Total-",
                        "ATTORNEY",
                        "on final passage",
                        "SPEAKER",
                        "BOARD",
                        "TREASURER",
                        "GOVERNOR",
                        "ARCHIVES",
                        "SECRETARY",
                ):
                    if junk in name:
                        junk_in_name = True
                        break
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == ".":
                        name = name[:-1]
                    name = self.clean_voter_name(name)
                    cur_array.append(name)

        # the counts are the lengths of the collected name lists
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        absent_count = len(absent_votes)
        not_voting_count = len(not_voting_votes)
        other_count = len(other_votes)

        # pass/fail comes from the vote mapping, not from the counts
        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.dedupe_key = url + "#" + bill.identifier

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("absent", absent_count)
        vote.set_count("not voting", not_voting_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        # names are passed through clean_voter_name again here; names taken
        # from a "Total--" match above were appended without cleaning
        for yes_vote in yes_votes:
            vote.vote("yes", self.clean_voter_name(yes_vote))
        for no_vote in no_votes:
            vote.vote("no", self.clean_voter_name(no_vote))
        for absent_vote in absent_votes:
            vote.vote("absent", self.clean_voter_name(absent_vote))
        for not_voting_vote in not_voting_votes:
            vote.vote("not voting", self.clean_voter_name(not_voting_vote))
        for other_vote in other_votes:
            vote.vote("other", self.clean_voter_name(other_vote))
        yield vote
Ejemplo n.º 14
0
    def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):
        """Yield a VoteEvent for each vote table on the bill's floor-votes page.

        Nothing is yielded when the page text does not contain "Votes:".
        """
        # vote code shown on the page -> vote option recorded on the VoteEvent
        option_for_code = {
            "Y": "yes",
            "NO": "no",
            "ER": "excused",
            "AB": "absent",
            "NV": "not voting",
            "EL": "other",
        }

        # request the bill data page with the floor votes section switched on
        url = assembly_url + "&Floor%26nbspVotes=Y"

        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        if "Votes:" not in doc.text_content():
            return

        seen_motions = []
        repeat_number = 2
        for table in doc.xpath("//table"):

            # the date is the text of the element right after the "DATE:" span
            date_label = table.xpath('caption/span[contains(., "DATE:")]')[0]
            date_text = next(date_label.itersiblings()).text
            parsed_date = datetime.datetime.strptime(date_text, "%m/%d/%Y")
            start_date = eastern.localize(parsed_date).isoformat()

            span_text = table.xpath("caption/span/text()")
            motion = span_text[2].strip() + span_text[3].strip()
            # a repeated motion text gets a running " - Vote N" suffix
            if motion in seen_motions:
                motion = motion + f" - Vote {repeat_number}"
                repeat_number += 1
            else:
                seen_motions.append(motion)

            # the nested span reads "<label>:<yes>/<no>"
            tally_text = table.xpath("caption/span/span")[0].text.split(":")[1]
            yes_count, no_count = map(int, tally_text.split("/"))
            if yes_count > no_count:
                result = "pass"
            else:
                result = "fail"
            vote = VoteEvent(
                chamber="lower",
                start_date=start_date,
                motion_text=motion,
                bill=bill,
                result=result,
                classification="passage",
            )

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)

            # the cell texts alternate name, vote code; absences and excusals
            # are counted here while the voters are recorded
            cell_texts = table.xpath("tr/td/text()")
            tallies = {"AB": 0, "ER": 0}
            for start in range(0, len(cell_texts), 2):
                name, vote_val = cell_texts[start : start + 2]
                vote.vote(option_for_code[vote_val], name)
                if vote_val in tallies:
                    tallies[vote_val] += 1

            vote.set_count("absent", tallies["AB"])
            vote.set_count("excused", tallies["ER"])
            vote.add_source(url)
            vote.dedupe_key = url + motion + span_text[1]

            yield vote
Ejemplo n.º 15
0
    def scrape_vote_history(self, bill, vurl):
        """
        Obtain the votes listed on a vote history page and link them to the Bill.

        Rows that do not have exactly 11 cells are logged and skipped, as are
        rows whose roll-call PDF was already seen (``self._seen_vote_ids``).

        :param bill: related bill
        :param vurl: source for the voteEvent information.
        :return: generator yielding one VoteEvent per usable table row
        :raises AssertionError: if a row's counts do not add up to its total
        """
        html = self.get(vurl).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(vurl)

        # skip first two rows
        for row in doc.xpath("//table/tr")[2:]:
            tds = row.getchildren()
            if len(tds) != 11:
                self.warning("irregular vote row: %s" % vurl)
                continue
            (
                timestamp,
                motion,
                vote_cell,
                yeas,
                nays,
                nv,
                exc,
                pres,
                abst,
                total,
                result,
            ) = tds

            timestamp = timestamp.text.replace("\xa0", " ")
            # NOTE(review): with %H the %p directive does not affect the parsed
            # hour; harmless here because only the date part is used below.
            timestamp = datetime.datetime.strptime(timestamp, "%m/%d/%Y %H:%M %p")

            yeas = int(yeas.text)
            nays = int(nays.text)
            others = int(nv.text) + int(exc.text) + int(abst.text) + int(pres.text)
            assert yeas + nays + others == int(total.text)

            if result.text == "Passed":
                passed = "pass"
            else:
                passed = "fail"

            # the link text marks House votes with "[H]"
            vote_link = vote_cell.xpath("a")[0]
            if "[H]" in vote_link.text:
                chamber = "lower"
            else:
                chamber = "upper"

            # Check for duplicates before touching the roll-call PDF, so a
            # repeated link does not cost another roll-call scrape.
            rollcall_pdf = vote_link.get("href")
            if rollcall_pdf in self._seen_vote_ids:
                self.warning("duplicate usage of %s, skipping", rollcall_pdf)
                continue
            self._seen_vote_ids.add(rollcall_pdf)

            vote = VoteEvent(
                chamber=chamber,  # 'upper' or 'lower'
                start_date=timestamp.strftime("%Y-%m-%d"),  # 'YYYY-MM-DD' format
                motion_text=motion.text,
                result=passed,
                classification="passage",  # Can also be 'other'
                # Provide a Bill instance to link with the VoteEvent...
                bill=bill,
            )

            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.set_count("other", others)

            vote.add_source(vurl)

            # obtain vote rollcall from pdf and add it to the VoteEvent object
            self.scrape_rollcall(vote, rollcall_pdf)
            vote.add_source(rollcall_pdf)
            vote.dedupe_key = rollcall_pdf  # distinct KEY for each one

            yield vote
Ejemplo n.º 16
0
    def scrape_action_page(self, bill, page):
        """Record every row of the action table on ``page`` as an action of ``bill``.

        Rows mentioning a House "Supplement" or a Senate "Roll Call" also get
        a ``VoteEvent`` built and filled in from the linked roll call, but
        those events are currently never yielded (see the XXX notes below).
        A row without an actor cell reuses the actor of the preceding row.
        """
        for tr in page.xpath("//tbody/tr"):
            parsed_date = datetime.strptime(tr.xpath("td[1]/text()")[0], "%m/%d/%Y")
            action_year = parsed_date.year
            action_date = parsed_date.strftime("%Y-%m-%d")

            actor_text = tr.xpath("td[2]/text()")
            if actor_text:
                action_actor = self.chamber_map_reverse[actor_text[0].strip()]

            action_name = tr.xpath("string(td[3])")

            # House votes
            if "Supplement" in action_name:
                motion_match = re.search(r"(.+)-\s*\d+\s*YEAS", action_name)
                if motion_match is None:
                    self.warning(
                        "vote {} did not match regex, skipping".format(action_name)
                    )
                    # NOTE: this skips the rest of the row, so the action
                    # itself is not added to the bill either.
                    continue

                yeas = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
                nays = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

                # get supplement number
                n_supplement = int(
                    re.findall(r"No\.\s*(\d+)", action_name, re.IGNORECASE)[0]
                )

                house_vote = VoteEvent(
                    chamber="lower",
                    start_date=action_date,
                    motion_text=motion_match.group(1).strip(),
                    result="pass" if yeas > nays else "fail",
                    classification="passage",
                    bill=bill,
                )
                house_vote.set_count("yes", yeas)
                house_vote.set_count("no", nays)

                housevote_pdf = (
                    "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                        bill.legislative_session, action_year
                    )
                )
                self.scrape_house_vote(house_vote, housevote_pdf, n_supplement)
                house_vote.add_source(housevote_pdf)
                house_vote.dedupe_key = "{}#{}".format(housevote_pdf, n_supplement)

                # XXX: house votes were disabled on 8/1 to try to get MA
                # importing again.  The yield is left here, commented out,
                # until the ID issue is resolved.
                # yield house_vote

            # Senate votes
            if "Roll Call" in action_name:
                # 2019 H86 breaks the first regex:
                #   Ordered to a third reading --
                #   see Senate   Roll Call #25 and House Roll Call 56
                # so the tallies are only parsed when both words are present.
                if "yeas" in action_name and "nays" in action_name:
                    lowered = action_name.lower()
                    tally = re.search(r"(\d+) yeas .*? (\d+) nays", lowered)
                    if tally is not None:
                        yeas, nays = (int(group) for group in tally.groups())
                    else:
                        # alternate wording: "yeas 12 ... nays 3"
                        yeas = int(re.search(r"yeas\s+(\d+)", lowered).group(1))
                        nays = int(re.search(r"nays\s+(\d+)", lowered).group(1))

                    # TODO: other count isn't included, set later
                    senate_vote = VoteEvent(
                        chamber="upper",
                        start_date=action_date,
                        # placeholder motion: everything before the first " -"
                        motion_text=action_name.split(" -")[0],
                        result="pass" if yeas > nays else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    senate_vote.set_count("yes", yeas)
                    senate_vote.set_count("no", nays)

                    rollcall_pdf = "http://malegislature.gov" + tr.xpath(
                        "string(td[3]/a/@href)"
                    )
                    self.scrape_senate_vote(senate_vote, rollcall_pdf)
                    senate_vote.add_source(rollcall_pdf)
                    senate_vote.dedupe_key = rollcall_pdf
                    # XXX: also disabled, see the note on house votes above
                    # yield senate_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs["classification"],
            )
            for committee in attrs.get("committees", []):
                action.add_related_entity(
                    committee.strip(), entity_type="organization"
                )
Ejemplo n.º 17
0
    def scrape_vote(self, bill, vote_id, session):
        """Fetch the roll call ``vote_id`` and yield it as a ``VoteEvent``.

        The roll call is requested by POSTing ``vote_id`` to the RollCall JSON
        endpoint.  Nothing is yielded when the response body is empty.
        Members whose short name is not in ``self.legislators_by_short`` are
        recorded under that short name, with a warning.
        """
        self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
        payload = self.post(
            url="https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId",
            data={"rollCallId": vote_id, "sort": "", "group": "", "filter": ""},
            allow_redirects=True,
        ).json()
        if not payload:
            return

        roll = payload["Model"]
        vote_chamber = self.chamber_map[roll["ChamberName"]]

        # timestamps look like "7/1/16 01:00 AM"; only the date part is kept
        taken_at = dt.datetime.strptime(roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p")
        vote_date = taken_at.strftime("%Y-%m-%d")

        # TODO: What does this code mean?
        vote_motion = roll["RollCallVoteType"]

        if roll["RollCallStatus"] == "Passed":
            vote_passed = "pass"
        else:
            vote_passed = "fail"

        # everything that is neither a yes nor a no is folded into "other"
        other_count = sum(
            int(roll[field])
            for field in (
                "NotVotingCount",
                "VacantVoteCount",
                "AbsentVoteCount",
                "ConflictVoteCount",
            )
        )

        vote = VoteEvent(
            chamber=vote_chamber,
            start_date=vote_date,
            motion_text=vote_motion,
            result=vote_passed,
            bill=bill,
            legislative_session=session,
            classification=[],
        )

        # The vote URL is just a generic search URL with POSTed data, so the
        # generated PDF is used as the source link and dedupe key instead.
        pdf_template = (
            "https://legis.delaware.gov"
            "/json/RollCallController/GenerateRollCallPdf"
            "?rollCallId={}&chamberId={}"
        )
        vote_pdf_url = pdf_template.format(vote_id, self.chamber_codes[vote_chamber])
        vote.add_source(vote_pdf_url)
        vote.dedupe_key = vote_pdf_url

        vote.set_count("yes", roll["YesVoteCount"])
        vote.set_count("no", roll["NoVoteCount"])
        vote.set_count("other", other_count)

        for member in roll["AssemblyMemberVotes"]:
            # AssemblyMemberId looks like it should work here,
            # but for some sessions it's bugged to only return session
            try:
                name = self.legislators_by_short[str(member["ShortName"])][
                    "DisplayName"
                ]
            except KeyError:
                self.warning(
                    "could not find legislator short name %s", member["ShortName"]
                )
                name = member["ShortName"]

            code = member["SelectVoteTypeCode"]
            if code == "Y":
                vote.yes(name)
            elif code == "N":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
Ejemplo n.º 18
0
    def parse_roll_call(self, bill, link, chamber, date):
        """Build a ``VoteEvent`` for ``bill`` from the roll-call page behind ``link``.

        The motion, the yes/no/other totals and each member's vote are read
        from the fetched HTML.  The vote passes only when the yeas outnumber
        nays and "other" votes combined.

        Raises ``Exception`` when a member entry carries a CSS class that maps
        to no known vote value.
        """
        url = link.attrib["href"]
        page = lxml.html.fromstring(self.get(url).text)

        raw_motion = page.xpath('string(//div[@class="Column-OneFourth"]/div[3])')
        motion = re.sub(r"\s+", " ", raw_motion.strip())

        # "FP" is shorthand for "FINAL PASSAGE"
        if motion in ("FP", "FINAL PASSAGE"):
            motion = "FINAL PASSAGE"
            classification = "passage"
        elif re.match(r"CONCUR(RENCE)? IN \w+ AMENDMENTS", motion):
            classification = "amendment"
        else:
            # unrecognised motion: fall back to the link text, unclassified
            classification = []
            motion = link.text_content()

        def count_after(label_cells):
            # the number lives in the element following the label's <div>
            return int(label_cells[0].getnext().text)

        # For the "YEAS" and "NAYS" counts PA has multiple HTML formats: one
        # where the label text is nested within a span, and another where it
        # is a direct child of the div.
        yea_cells = page.xpath("//div/span[text() = 'YEAS']/..") or page.xpath(
            "//div[text()[normalize-space() = 'YEAS']]"
        )
        yeas = count_after(yea_cells)

        nay_cells = page.xpath("//div/span[text() = 'NAYS']/..") or page.xpath(
            "//div[text()[normalize-space() = 'NAYS']]"
        )
        nays = count_after(nay_cells)

        # "LVE" and "N/V" have been moved up as direct children of the div
        # element; either may be absent from the page.
        other = 0
        for label in ("LVE", "N/V"):
            label_cells = page.xpath(
                '//div[text()[normalize-space() = "%s"]]' % label
            )
            if label_cells:
                other += count_after(label_cells)

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=classification,
            result="pass" if yeas > (nays + other) else "fail",
            bill=bill,
        )
        # The dedupe_key situation here is a bit weird: the same vote can be
        # used for multiple bills, see
        # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11       # noqa
        # so the bill id is tacked onto the end of the URL.
        vote.dedupe_key = url + "#" + bill.identifier
        vote.add_source(url)
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("other", other)

        # checked in order; the first marker found in the class attribute wins
        class_markers = (
            ("yea", "yes"),
            ("nay", "no"),
            ("nvote", "other"),
            ("lve", "other"),
        )
        for entry in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = entry[0].tail.strip()
            name = re.sub(r"^[\s,]+", "", name)
            name = re.sub(r"[\s,]+$", "", name)

            class_attr = entry.attrib["class"].lower()
            for marker, voteval in class_markers:
                if marker in class_attr:
                    break
            else:
                raise Exception("Unrecognized vote val: %s" % class_attr)
            vote.vote(voteval, name)

        return vote
Ejemplo n.º 19
0
    def scrape_votes(self, bill, url):
        """Yield one ``Vote`` per roll call found on the page at ``url``.

        The page is fetched with ``self.get``, non-breaking spaces are replaced
        by plain spaces, and every ``<p>`` whose text matches
        ``OKLAHOMA HOUSE`` / ``OKLAHOMA STATE SENATE`` is treated as the header
        of one roll call.  The paragraphs following a header are read line by
        line to collect the tallies and the member names.

        Roll calls whose ``RCS#`` was already seen on this page are skipped,
        as are roll calls flagged as badly formatted.  Note that a header with
        no motion text ends the whole generator (``return``), not just that
        roll call.

        Raises ``Exception`` if a collected "no" voter name contains ``":"``.
        """
        page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

        # RCS numbers already emitted for this page
        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={"re": re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if "HOUSE" in header.xpath("string()"):
                chamber = "lower"
                motion_index = 8
            else:
                chamber = "upper"
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r"\s+", " ", motion)
            if not motion.strip():
                self.warning("Motion text not found")
                # stops processing of every remaining header on the page
                return
            # a trailing PASSED/FAILED is stripped from the motion and used
            # as the result; otherwise the result is derived from the counts
            match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == "PASSED"
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            # the date is read from the element right after the RCS# paragraph
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r"\d+/\d+/\d+", date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the vote category that subsequent name lines belong to
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            # nothing is treated as a name until the YEAS heading was seen
            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace("\r\n", " ").strip()
                # a row of asterisks ends this roll call
                if "*****" in line:
                    break
                regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                         r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
                match = re.match(regex, line)
                if match:
                    if match.group(1) == "YEAS" and "RCS#" not in line:
                        vtype = "yes"
                        seen_yes = True
                    elif match.group(1) == "NAYS" and seen_yes:
                        vtype = "no"
                    elif match.group(1) == "VACANT":
                        continue  # skip these
                    elif seen_yes:
                        vtype = "other"
                    # anything after the count on a heading line is unexpected
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    # NOTE: before YEAS is seen vtype is still None, so such
                    # counts accumulate under the None key and are never read
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    # names on a line are separated by runs of three spaces
                    for name in line.split("   "):
                        if not name:
                            continue
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts["yes"] > (counts["no"] + counts["other"])

            vote = Vote(
                chamber=chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])
            # several roll calls share one page, so the RCS number makes the
            # key unique
            vote.dedupe_key = url + "#" + rcs

            vote.add_source(url)

            for name in votes["yes"]:
                vote.yes(name)
            for name in votes["no"]:
                # a colon suggests a heading was mistaken for a name
                if ":" in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes["other"]:
                vote.vote("other", name)

            yield vote
Ejemplo n.º 20
0
    def scrape_vote(self, url, session):
        """Parse the roll-call PDF at ``url`` and return it as a ``VoteEvent``.

        The PDF is downloaded, converted to text and read in two phases: a
        preamble (bill id, motion, tally line) followed by the per-member name
        lists grouped under "Voting Yea", "Voting Nay", and so on.

        Returns ``None`` instead of a vote when:

        * the text does not contain "Maryland" (treated as an empty vote),
        * a preamble line mentions "vetoed",
        * the bill id, the motion or the tally line cannot be found.

        All of these except "neither bill id nor motion found" log a warning.
        ``check_counts`` is called with ``raise_error=True`` before the vote
        is returned.
        """
        fname, _ = self.urlretrieve(url)
        text = convert_pdf(fname, type="text").decode()
        lines = text.splitlines()

        chamber = "upper" if "senate" in url else "lower"
        if "Maryland" not in text:
            self.warning(f"empty vote from {url}")
            return
        date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

        section = "preamble"
        motion = None
        bill_id = None
        how = None
        # (yes, no, not voting, excused, absent); stays None until the tally
        # line has been seen
        tally = None
        voters = defaultdict(list)

        for line in lines:
            if section == "preamble":
                if "vetoed" in line.lower():
                    self.warning(
                        f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                    )
                    return
                possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
                if possible_bill_id:
                    bill_id = possible_bill_id[0]

                # preamble has metadata, then motion, then counts.  our process then is to
                # store the last line as the motion, but if the last line looks like a
                # continuation, append it to the prior line

                line = line.strip()
                counts = re.findall(
                    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                    line,
                )
                if counts:
                    tally = tuple(int(count) for count in counts[0])
                    section = "votes"
                elif line and line != "(Const)":
                    # questions seem to be split across two lines; only
                    # append when there is a prior line to append to
                    if line.endswith("?") and motion:
                        motion = motion + " " + line
                    else:
                        motion = line
            elif section == "votes":
                if line.startswith("Voting Yea"):
                    how = "yes"
                elif line.startswith("Voting Nay"):
                    how = "no"
                elif line.startswith("Not Voting"):
                    how = "not voting"
                elif line.startswith("Excused from Voting"):
                    how = "excused"
                elif line.startswith("Excused (Absent)"):
                    how = "absent"
                elif how:
                    # names are laid out in columns separated by 2+ spaces
                    names = re.split(r"\s{2,}", line)
                    voters[how].extend(names)

        if not bill_id and not motion:
            return
        if bill_id and not motion:
            self.warning(
                f"got {bill_id} but no motion, not registering as a vote")
            return
        if motion and not bill_id:
            self.warning(
                f"got {motion} but no bill_id, not registering as a vote")
            return
        if tally is None:
            # without the tally line there are no counts to register
            self.warning(
                f"no vote counts found in {url}, not registering as a vote")
            return

        yes_count, no_count, nv_count, excused_count, absent_count = tally

        # bleh - result not indicated anywhere
        result = "pass" if yes_count > no_count else "fail"
        bill_chamber = "upper" if bill_id.startswith("S") else "lower"
        date = datetime.datetime.strptime(date,
                                          "%b %d, %Y").strftime("%Y-%m-%d")
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            result=result,
            classification="passage",
            motion_text=motion,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )
        # URL includes sequence ID, will be unique
        vote.dedupe_key = url
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)
        for how, names in voters.items():
            for name in names:
                name = name.strip().replace("*", "")
                # drop blanks and footer text that landed in the name columns
                if name and "COPY" not in name and "Indicates Vote Change" not in name:
                    vote.vote(how, name)
        check_counts(vote, raise_error=True)
        return vote
Ejemplo n.º 21
0
    def parse_vote(self, bill, link):
        """Fetch the vote page at ``link`` and yield it as a ``VoteEvent``.

        Nothing is yielded (a warning is logged instead) when the page is a
        Varnish rate-limit page, a "Page Not Found" / "Page Unavailable" page,
        or has no tally headings after the first ``<h3>``.

        Tallies are read from the headings' "(N)" suffixes; a category with no
        heading counts as 0.  Member names are taken from the links in the
        main content in order: the first ``yes_count`` are recorded as yes,
        the next ``no_count`` as no, and the rest as other.
        """
        # Server sometimes sends proper error headers,
        # sometimes not
        try:
            self.info("Get {}".format(link))
            # NOTE(review): requests.get() does not raise HTTPError for
            # 4xx/5xx responses unless raise_for_status() is called, so this
            # handler looks unreachable; error pages are caught by the text
            # checks below instead.  Confirm before relying on it.
            text = requests.get(link).text
        except requests.exceptions.HTTPError as err:
            self.warning("{} fetching vote {}, skipping".format(err, link))
            return

        if "Varnish cache server" in text:
            self.warning("Scrape rate is too high, try re-scraping with "
                         "The --rpm set to a lower number")
            return

        if "Page Not Found" in text or "Page Unavailable" in text:
            self.warning("missing vote, skipping")
            return

        member_doc = lxml.html.fromstring(text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        chamber_date_line = "".join(
            member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
        chamber_date_line_words = chamber_date_line.split()
        # first word is the chamber, last word the date; the words in between
        # (minus one on each side) are used as the motion text
        vote_chamber = chamber_date_line_words[0]
        vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                               "%m/%d/%Y")
        vote_status = " ".join(chamber_date_line_words[2:-2])

        opinions = member_doc.xpath(
            "//div[@id='main_content']/h3[position() > 1]/text()")
        if not opinions:
            self.warning("No Votes for: %s", link)
            return

        # fall back to the <h4> text when the header line carries no motion
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

        # A heading that is absent from the page means nobody voted that way,
        # so every tally starts at zero rather than being left unbound.
        yes_count = no_count = p_count = a_count = 0
        for heading in opinions:
            try:
                count = int(heading[heading.find("(") + 1:heading.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably 'On roll call the vote was:'
                continue

            lowered = heading.lower()
            if "yea" in lowered:
                yes_count = count
            elif "nay" in lowered:
                no_count = count
            elif "present" in lowered:
                p_count = count
            elif "absent" in lowered:
                a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime("%Y-%m-%d"),
            chamber=vote_chamber,
            motion_text=vote_status,
            result="pass" if yes_count > no_count else "fail",
            classification="passage",
        )
        vote.dedupe_key = link

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("abstain", p_count)
        vote.set_count("absent", a_count)

        vote.add_source(link)

        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        # the first link is skipped; positions 1..yes_count are the yeas,
        # the following no_count positions the nays, the remainder "other"
        for position, label in enumerate(a_links[1:], start=1):
            voter = re.sub(",", "", label).split()[0]
            if position <= yes_count:
                vote.vote("yes", voter)
            elif yes_count < position <= yes_count + no_count:
                vote.vote("no", voter)
            else:
                vote.vote("other", voter)
        yield vote
Ejemplo n.º 22
0
    def scrape_votes(self, session):
        """Yield one ``Vote`` per roll call of ``session`` on a known bill.

        Two pipe-delimited dumps are read: ``RollCallSummary.txt`` supplies one
        record per roll call (chamber, number, timestamp, bill, tallies,
        motion) and ``RollCallHistory.txt`` supplies one record per member
        vote, which is attached to the matching roll call.  Only records whose
        session year equals ``session`` and whose bill id is present in
        ``self.bills_by_id`` are used.

        A summary record broken across two physical lines is stitched back
        together; a fragment too short to be parsed on its own is held back
        until its continuation arrives.
        """
        votes = {}
        other_counts = defaultdict(int)
        last_line = []
        vote_url = f"http://www.gencourt.state.nh.us/dynamicdatadump/RollCallSummary.txt?x={self.cachebreaker}"
        lines = self.get(vote_url).content.decode("utf-8").splitlines()

        for line in lines:
            if len(line) < 2 or line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                # The first cell of a continuation line is the tail of the
                # field that was broken, so it is dropped when stitching.
                merged = last_line + line[1:]
                if len(merged) == 14:
                    line = merged
                    last_line = []
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
                    if len(line) < 12:
                        # not enough columns to reach the motion (index 11);
                        # wait for the continuation instead of crashing
                        continue

            # The payload was decoded as UTF-8, so a leading BOM shows up as
            # U+FEFF; the raw-byte spelling is kept for backward compatibility.
            session_yr = line[0].replace("\xef\xbb\xbf", "").replace("\ufeff", "")
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                started = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                started = pytz.timezone("America/New_York").localize(
                    started).isoformat()
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=started,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(vote_url)
                vote.dedupe_key = session_yr + body + vote_num  # unique ID for vote
                votes[body + vote_num] = vote

        history_url = f"http://www.gencourt.state.nh.us/dynamicdatadump/RollCallHistory.txt?x={self.cachebreaker}"
        for line in self.get(history_url).content.decode("utf-8").splitlines():
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, choice, _date = line.split(
                "|")
            # same BOM concern as for the summary file
            session_yr = session_yr.replace("\ufeff", "")

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = " ".join(self.legislators[employee]["name"].split())
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                choice = choice.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue
                # code = self.legislators[employee]['seat']

                if choice == "Yea":
                    votes[vote_key].yes(leg)
                elif choice == "Nay":
                    votes[vote_key].no(leg)
                else:
                    votes[vote_key].vote("other", leg)
                    # hack-ish, but will keep the vote count sync'd
                    other_counts[vote_key] += 1
                    votes[vote_key].set_count("other", other_counts[vote_key])

        yield from votes.values()
Ejemplo n.º 23
0
    def parse_vote_pdf(self, vote_url, bill):
        """Parse the roll-call PDF at ``vote_url`` into a ``VoteEvent`` for ``bill``.

        The PDF is downloaded and converted to text.  The date comes from the
        first line (after "Calendar Date:"), the tallies from the first line
        containing both "Yeas" and "Nays", the motion from one of the lines
        just above the tallies, and the member names from the lines below.

        Returns the populated ``VoteEvent``.

        Raises ``ValueError`` when no tally line is found or when the number
        of names parsed per category differs from the printed tallies.
        """

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        if "Senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # index of the tally line; every other position is relative to it
        page_index = None
        for index, line in enumerate(lines):
            if "Yeas" in line and "Nays" in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        # presumably the same order in which the PDF prints the tallies,
        # since counts are stored by position - TODO confirm
        vote_types = ["yes", "no", "not voting", "excused", "absent"]

        # NOTE(review): this truthiness test is also false for index 0, not
        # only for None; line 0 is expected to be the "Calendar Date:" line
        if page_index:

            # tallies look like "<number> <label>" separated by 2+ spaces
            counts = re.split(r"\s{2,}", lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(" ", 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ["Consent Calendar" in line for line in lines[:page_index]]
        )
        # only asserted to be non-empty below; not otherwise used here
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]
        motion_lines = [
            3,
            2,
            4,
            5,
        ]  # Relative LineNumbers to be checked for existence of motion

        # Walk the candidate lines above the tally line, stopping as soon as
        # the current candidate contains a motion keyword.  The candidate
        # picked up on the last iteration is only checked in the else branch.
        for i in motion_lines:
            if any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                break
            motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
        else:
            # reached only when the loop ran to the end without a break
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # This condition covers the bad formatting in SB 1260:
                # use the whole line instead of its first column
                motion = lines[page_index - 3]
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.dedupe_key = "{}#{}".format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        # from here on page_index is a cursor into the name lists, starting
        # two lines below the tally line
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        # position in vote_types that the names currently being read belong to
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            # blank lines and the "Voting Yea" heading do not change category
            # (vote_index already starts at "yes")
            if not current_line or "Voting Yea" in current_line:
                page_index += 1
                continue

            # every other heading or footer line moves on to the next category
            if any(show_stopper in current_line for show_stopper in show_stoppers):
                page_index += 1
                vote_index = vote_index + 1
                continue

            # names are laid out in columns separated by 2+ spaces
            names = re.split(r"\s{2,}", current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Ejemplo n.º 24
0
    def scrape_pdf_for_votes(self, session, actor, date, motion, href):
        """Build a ``VoteEvent`` from the roll-call PDF located at ``href``.

        The PDF lines come from ``self.fetch_pdf_lines``.  Three kinds of line
        matter: a bare outcome word (``PASSED``, ``FAILED``, ...), the tally
        line (``n YEAS  n NAYS  n PRESENT``) and, once the tally has been seen,
        lines that start with one of ``VOTE_VALUES``.  Those last lines are
        handed to ``find_columns_and_parse``, whose result is iterated as a
        mapping of voter name to vote code.

        Returns ``False`` when no PDF lines are available, otherwise the
        populated ``VoteEvent``.  Raises ``Exception`` when the document holds
        two outcome words that disagree with each other.
        """
        tally_re = re.compile(
            r"^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$"
        )
        outcome_words = {
            "PASSED": "pass",
            "PREVAILED": "fail",
            "ADOPTED": "pass",
            "CONCURRED": "pass",
            "FAILED": "fail",
            "LOST": "fail",
        }

        pdflines = self.fetch_pdf_lines(href)
        if not pdflines:
            return False

        needs_review = False
        yes_count = no_count = present_count = 0
        passed = None
        counts_found = False
        vote_lines = []

        # The outcome and the tally are read from the document itself rather
        # than being derived from the number of names that get parsed.
        for raw_line in pdflines:
            text = raw_line.strip()
            if not text:
                continue

            outcome = outcome_words.get(text)
            if outcome is not None:
                # A repeated outcome word is fine; a contradictory one is not.
                if passed is not None and passed != outcome:
                    raise Exception("Duplicate pass/fail matches in [%s]" % href)
                passed = outcome
                continue

            tally = tally_re.match(raw_line)
            if tally:
                # The optional fourth group (NOT VOTING) is not used.
                yes_count, no_count, present_count = (
                    int(number) for number in tally.groups()[:3]
                )
                counts_found = True
                continue

            # Individual votes are only collected after the tally line.
            if counts_found and any(
                re.search(r"^\s*({})\s+\w".format(value), raw_line)
                for value in VOTE_VALUES
            ):
                vote_lines.append(raw_line)

        names_by_code = {"Y": [], "N": [], "P": [], "E": [], "NV": [], "A": []}
        for name, vcode in find_columns_and_parse(vote_lines).items():
            if name == "Mr. Speaker":
                name = session_details[session]["speaker"]
            elif name == "Mr. President":
                name = session_details[session]["president"]
            else:
                # Converts "Davis,William" to "Davis, William".
                name = re.sub(r"\,([a-zA-Z])", r", \1", name)

            recorded = names_by_code.get(vcode)
            if recorded is not None:
                recorded.append(name)

        yes_votes = names_by_code["Y"]
        no_votes = names_by_code["N"]
        present_votes = names_by_code["P"]
        excused_votes = names_by_code["E"]
        not_voting = names_by_code["NV"]
        absent_votes = names_by_code["A"]

        if not (yes_count or no_count or present_count):
            # No usable tally: fall back to counting the parsed names.
            yes_count = len(yes_votes)
            no_count = len(no_votes)
        else:
            # Audit the document tally against the names actually parsed.
            if yes_count != len(yes_votes):
                self.warning(
                    "Mismatched yes count [expect: %i] [have: %i]"
                    % (yes_count, len(yes_votes))
                )
                needs_review = True
            if no_count != len(no_votes):
                self.warning(
                    "Mismatched no count [expect: %i] [have: %i]"
                    % (no_count, len(no_votes))
                )
                needs_review = True

        if passed is None:
            if actor["classification"] == "lower":  # senate doesn't have these lines
                self.warning(
                    "No pass/fail word found; fall back to comparing yes and no vote."
                )
                needs_review = True
            passed = "pass" if yes_count > no_count else "fail"

        classification, _ = _categorize_action(motion)
        vote_event = VoteEvent(
            legislative_session=session,
            motion_text=motion,
            classification=classification,
            organization=actor,
            start_date=date,
            result=passed,
        )

        for voter in yes_votes:
            vote_event.yes(voter)
        for voter in no_votes:
            vote_event.no(voter)
        for option, voters in (
            ("other", present_votes),
            ("excused", excused_votes),
            ("not voting", not_voting),
            ("absent", absent_votes),
        ):
            for voter in voters:
                vote_event.vote(option, voter)

        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("other", present_count),
            ("excused", len(excused_votes)),
            ("absent", len(absent_votes)),
            ("not voting", len(not_voting)),
        ):
            vote_event.set_count(option, total)

        vote_event.add_source(href)

        # for distinguishing between votes with the same id and on same day
        vote_event.dedupe_key = href

        if needs_review:
            self.warning("Warnings were issued. Best to check %s" % href)
        return vote_event
Ejemplo n.º 25
0
    def scrape_vote(self, session, bill, vote_url, chamber, date):
        """Yield the ``VoteEvent`` described by the vote summary page ``vote_url``.

        Nothing is yielded when the page has fewer than three ``<font>`` text
        nodes (a warning is logged), when the motion text mentions
        "withdrawn", or when the Aye/No counts cannot be located (an info
        message is logged).  ``session`` is accepted but not used here.
        """
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # Some pages carry a timestamp where the motion should be,
        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if "/" in motion and ("AM" in motion or "PM" in motion):
            motion = "Motion not given."

        if "withdrawn" in motion:
            return

        # The three tallies are read from cells that follow a cell whose
        # label contains the given text.
        count_query = ("//tr/td[preceding-sibling::td/descendant::"
                       "font[contains(text(),'{}')]]/font/text()")
        yes_no_counts = page.xpath(count_query.format("Aye"))
        other_counts = page.xpath(count_query.format("Absent"))
        abstain_counts = page.xpath(count_query.format("17C"))

        if not yes_no_counts:
            self.info("Missing yes no count")
            return

        yes_count = int(yes_no_counts[0])
        no_count = int(yes_no_counts[2])
        exc_count = int(other_counts[2])
        absent_count = int(other_counts[0])
        abstain_count = int(abstain_counts[0]) if abstain_counts else 0

        # fix for
        # http://leg.colorado.gov/content/hb19-1029vote65e72e
        if absent_count == -1:
            absent_count = 0

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if yes_count > no_count else "fail",
            bill=bill,
            classification="passage",
        )
        vote.dedupe_key = vote_url
        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("excused", exc_count),
            ("absent", absent_count),
            ("abstain", abstain_count),
        ):
            vote.set_count(option, total)
        vote.add_source(vote_url)

        option_for_mark = {
            "Y": "yes",
            "N": "no",
            "E": "excused",
            "A": "absent",
            "-": "absent",
            "17C": "abstain",
        }
        roll_rows = page.xpath("//tr[preceding-sibling::tr/descendant::"
                               "td/div/b/font[contains(text(),'Vote')]]")
        for row in roll_rows:
            marks = row.xpath(".//td/div/font/text()")
            if not marks:
                continue
            voted = marks[0].strip()
            voter = row.xpath(".//td/font/text()")[0].strip()
            # Rows marked "V" are skipped and not recorded.
            if voted == "V":
                continue
            vote.vote(option_for_mark[voted], voter)

        yield vote
Ejemplo n.º 26
0
    def scrape_bills(self, session, year_abr):
        """Yield vote events and bills built from the NJ legislature data dumps.

        Tables are read through ``self.to_csv`` (bills, sponsors, documents,
        actions, subjects) and vote archives are downloaded through
        ``self.urlretrieve`` for ``year_abr`` and the following year.

        Vote events are yielded first, each exactly once after every vote
        archive has been read, so a vote that appears in several archives is
        tallied over all of its rows.  Bills are yielded last; a bill with
        neither actions nor versions is treated as phony and only logged.

        Records that reference a bill id missing from the main bill table are
        skipped with a warning.  Raises ``Exception`` for a document row whose
        ``DocType`` is not in ``self._doctypes``.
        """
        # Main Bill information
        main_bill_csv = self.to_csv("MAINBILL.TXT")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec["IdenticalBillNumber"].strip():
                bill.add_related_bill(
                    rec["IdenticalBillNumber"].split()[0],
                    legislative_session=session,
                    relation_type="companion",
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.to_csv("BILLSPON.TXT")

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            # "P" marks a primary sponsor; everything else is a cosponsor.
            sponsor_type = "primary" if rec["Type"] == "P" else "cosponsor"
            bill.add_sponsorship(
                name,
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Documents
        bill_document_csv = self.to_csv("BILLWP.TXT")

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            # Keep only the last directory and the file name of the
            # backslash-separated path.
            document = rec["Document"]
            document = document.split("\\")
            document = document[-2] + "/" + document[-1]

            htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".HTM"))
            pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".PDF"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["DocType"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" %
                                (rec["DocType"], bill_id))
            if rec["Comment"]:
                doc_name += " " + rec["Comment"]

            # Clean links.
            if htm_url.endswith("HTMX"):
                htm_url = re.sub("X$", "", htm_url)
            if pdf_url.endswith("PDFX"):
                pdf_url = re.sub("X$", "", pdf_url)

            if rec["DocType"] in self._version_types:
                # Decide the media type afresh for every record so that an
                # unrecognised extension can neither reuse the previous
                # record's type nor hit an unbound local.
                lowered_url = htm_url.lower()
                if lowered_url.endswith("htm"):
                    mimetype = "text/html"
                elif lowered_url.endswith("wpd"):
                    mimetype = "application/vnd.wordperfect"
                else:
                    mimetype = None
                    self.warning(
                        "unrecognized document extension for {} on bill {}".format(
                            htm_url, bill_id))
                try:
                    if mimetype is not None:
                        bill.add_version_link(doc_name,
                                              htm_url,
                                              media_type=mimetype)
                    bill.add_version_link(doc_name,
                                          pdf_url,
                                          media_type="application/pdf")
                except ValueError:
                    self.warning(
                        "Couldn't find a document for bill {}".format(bill_id))
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]
        # keep votes clean globally, a few votes show up in multiple files
        votes = {}
        # bill ids already reported as unknown, to warn once per bill
        unknown_vote_bills = set()

        for filename in vote_info_list:
            s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip"
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.HTTPError:
                self.warning("could not find %s" % s_vote_url)
                continue

            if filename.startswith(("A", "CA")):
                chamber = "lower"
            else:
                chamber = "upper"
            vote_file_type = "committee" if filename.startswith("C") else "chamber"

            # Close the archive (and each member) before deleting the file.
            with zipfile.ZipFile(s_vote_zip) as zippedfile:
                for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                    try:
                        member = zippedfile.open(vfile, "r")
                    except KeyError:
                        #
                        # Right, so, 2011 we have an "End" file with more
                        # vote data than was in the original dump.
                        #
                        self.warning("No such file: %s" % (vfile))
                        continue

                    with io.TextIOWrapper(member, encoding="latin-1") as vote_file:
                        for rec in csv.DictReader(vote_file):
                            if vote_file_type == "chamber":
                                bill_id = rec["Bill"].strip()
                                leg = rec["Full_Name"]

                                date = rec["Session_Date"]
                                action = rec["Action"]
                                leg_vote = rec["Legislator_Vote"]
                                vote_parts = (bill_id, chamber, action)
                            else:
                                bill_id = "%s%s" % (rec["Bill_Type"],
                                                    rec["Bill_Number"])
                                leg = rec["Name"]
                                # drop time portion
                                date = rec["Agenda_Date"].split()[0]
                                # make motion readable
                                action = self._com_vote_motions[rec["BillAction"]]
                                # first char (Y/N) use [0:1] to ignore ''
                                leg_vote = rec["LegislatorVote"][0:1]
                                committee = rec["Committee_House"]
                                vote_parts = (bill_id, chamber, action, committee)

                            date = datetime.strptime(date, "%m/%d/%Y")
                            vote_id = "_".join(vote_parts).replace(" ", "_")

                            if vote_id not in votes:
                                # Same policy as the other tables: a row for
                                # a bill we do not know is skipped, not fatal.
                                if bill_id not in bill_dict:
                                    if bill_id not in unknown_vote_bills:
                                        unknown_vote_bills.add(bill_id)
                                        self.warning(
                                            "unknown bill %s in vote database"
                                            % bill_id)
                                    continue
                                votes[vote_id] = VoteEvent(
                                    start_date=TIMEZONE.localize(date),
                                    chamber=chamber,
                                    motion_text=action,
                                    classification="passage",
                                    result=None,
                                    bill=bill_dict[bill_id],
                                )
                                votes[vote_id].dedupe_key = vote_id
                            if leg_vote == "Y":
                                votes[vote_id].vote("yes", leg)
                            elif leg_vote == "N":
                                votes[vote_id].vote("no", leg)
                            else:
                                votes[vote_id].vote("other", leg)

            # remove temp file
            os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote.  This runs once,
        # after every archive has been read, so each vote is yielded a single
        # time with a single source and a complete tally.
        for vote in votes.values():
            counts = collections.defaultdict(int)
            for cast in vote.votes:
                counts[cast["option"]] += 1
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])

            # Veto override.
            if vote.motion_text == "OVERRIDE":
                # Per the NJ leg's glossary, a veto override requires
                # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                # NOTE(review): this relies on ``vote.bill`` containing the
                # chamber name; if it contains neither, ``result`` stays
                # ``None`` - confirm against the VoteEvent implementation.
                if "lower" in vote.bill:
                    vote.result = "pass" if counts["yes"] >= 54 else "fail"
                elif "upper" in vote.bill:
                    vote.result = "pass" if counts["yes"] >= 27 else "fail"
            else:
                # Regular vote.
                vote.result = "pass" if counts["yes"] > counts["no"] else "fail"

            vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
            yield vote

        # Actions
        bill_action_csv = self.to_csv("BILLHIST.TXT")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = dateutil.parser.parse(date)
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += " " + comment
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.to_csv("BILLSUBJ.TXT")
        for rec in subject_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in subject database" % bill_id)
                continue
            bill_dict[bill_id].subject.append(rec["SubjectKey"])

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning("probable phony bill detected %s",
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield bill

        if phony_bill_count:
            self.warning("%s total phony bills detected", phony_bill_count)
Ejemplo n.º 27
0
    def process_vote(self, votes, url, base_url, bill, legislators,
                     chamber_dict, vote_results):
        """Yield one ``VoteEvent`` per usable entry of ``votes["items"]``.

        ``legislators`` maps the voter ids found in the payload to the names
        recorded on the vote, ``chamber_dict`` maps the payload's chamber
        value to a chamber name and ``vote_results`` maps a lower-cased
        result string to a truthy (passed) or falsy (failed) value.

        An entry is skipped with a warning when it has no vote data (even
        after following its ``link`` below ``base_url``), when it has neither
        a ``date`` nor an ``occurred`` field, or when the payload looks
        truncated (no motion and string-valued yeas/nays).
        """
        for v in votes["items"]:
            if "yeas" not in v:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url + v["link"]).json()
                if "yeas" not in v:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"

            if "date" in v:
                raw_date = v["date"]
            elif "occurred" in v:
                raw_date = v["occurred"]
            else:
                self.logger.warning("No date found for vote, skipping")
                continue
            date = "{:%Y-%m-%d}".format(self._tz.localize(
                datetime.datetime.strptime(raw_date, "%m/%d/%y")))

            motion = v["action"] if "action" in v else v["motiontype"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in
            # that case skip.  Checked before the motion lookup so that a
            # truncated entry is dropped without touching fields it may lack.
            if not motion and isinstance(v["yeas"], str) and isinstance(
                    v["nays"], str):
                warning_text = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(warning_text.format(v["revno"]))
                continue

            if motion in self._vote_motion_dict:
                motion_text = self._vote_motion_dict[motion]
            else:
                self.warning(
                    "Unknown vote code {}, please add to _vote_motion_dict".
                    format(motion))
                # "results" is optional (see the result lookup below); fall
                # back to the raw motion code rather than raising KeyError.
                motion_text = v["results"] if "results" in v else str(motion)

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v["yeas"]) > len(v["nays"]):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            # Committee votes only differ by classification; the committee
            # itself (v["committee"]) is not attached as the organization.
            classification = "committee-passage" if "committee" in v else "passage"
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion_text,
                result="pass" if passed else "fail",
                classification=classification,
                bill=bill,
            )
            # Concatenate the bill identifier and vote identifier to avoid collisions
            vote.dedupe_key = "{}:{}".format(bill.identifier.replace(" ", ""),
                                             v["revno"])

            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is, so counts are taken from the id lists.
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
            for option in ("absent", "excused"):
                for voter_id in v.get(option, []):
                    vote.vote(option, legislators[voter_id])

            vote.set_count("yes", len(v["yeas"]))
            vote.set_count("no", len(v["nays"]))
            vote.set_count("absent", len(v.get("absent", [])))
            vote.set_count("excused", len(v.get("excused", [])))

            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            counted_keys = ("yeas", "nays", "absent", "excused")
            for key, val in v.items():
                if isinstance(val, list) and val and key not in counted_keys:
                    if val[0] in legislators:
                        self.logger.warning(
                            "{k} looks like a vote type that's not being counted."
                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
Ejemplo n.º 28
0
    def scrape_house_vote(self, bill, url):
        """Yield a lower-chamber ``VoteEvent`` parsed from the vote PDF at ``url``.

        The PDF is downloaded with ``self.urlretrieve``, converted to text by
        ``convert_pdf`` and the temporary file is removed.  The text is then
        scanned for a trailing ``m/d/yyyy`` date, the summary line holding
        the YEAS / NAYS / NOT VOTING totals (the motion is the text two lines
        above it) and the per-category name listings.

        Nothing is yielded, and a warning is logged, when the download
        fails, when no date is found, or when no summary line is found.
        """
        try:
            filename, resp = self.urlretrieve(url, timeout=80)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, "text")
        os.remove(filename)

        # Each line is decoded from UTF-8 below, so ``text`` is bytes here.
        lines = text.splitlines()

        # Section header -> option under which the listed names are recorded.
        section_options = {
            "YEAS": "yes",
            "NAYS": "no",
            "NOT VOTING": "other",
            "EXCUSED": "other",
            "PAIRED": "paired",
        }

        vote_type = None
        votes = collections.defaultdict(list)
        date = None
        # Filled in by the summary line; ``motion`` stays None until then so
        # a document without a summary can be detected instead of crashing.
        motion = None
        passed = False
        yes_count = no_count = other_count = 0

        for idx, line in enumerate(lines):
            line = line.rstrip().decode("utf-8")
            match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
            if match:
                # The motion sits two lines above the summary; guard against
                # a negative index wrapping round to the end of the document.
                motion = lines[idx - 2].strip().decode("utf-8") if idx >= 2 else ""
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                # Excused members are folded into the "other" total.
                exc_match = re.search(r"EXCUSED: (\d+)", line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(("ADOPTED", "PASSED"))
                continue

            match = re.match(
                r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
            if match:
                vote_type = section_options[match.group(1)]
                continue

            if vote_type == "paired":
                # Paired entries look like "Name (YEA)"; each fragment is
                # parsed on its own so every pair on the line is recorded.
                for part in line.split("   "):
                    part = part.strip()
                    if not part:
                        continue
                    pair = re.match(r"([^\(]+)\((YEA|NAY)\)", part)
                    if pair is None:
                        # Not a "Name (YEA|NAY)" fragment; nothing to record.
                        continue
                    name, pair_type = pair.groups()
                    option = "yes" if pair_type == "YEA" else "no"
                    votes[option].append(name.strip())
            elif vote_type:
                for name in line.split("   "):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if not date:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
            return
        if motion is None:
            self.warning("No vote tally found in %s" % url)
            return

        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.dedupe_key = url

        for key, values in votes.items():
            for value in values:
                # Entries mentioning "Committee" are not voters.
                if "Committee" in value:
                    continue
                vote.vote(key, value.replace("*", ""))

        yield vote
Ejemplo n.º 29
0
    def scrape_vote(self, bill, date, url):
        """Yield the ``VoteEvent`` described by the JSON roll call at ``url``.

        The chamber is derived from ``actionLog.FullName`` (House, Senate or
        Joint); any other value, or an empty one, logs a warning and yields
        nothing.  A roll call with an empty ``actionLog.StatusText`` is
        skipped silently because no motion can be determined.
        """
        page = self.get(url).json()

        location = page["actionLog"]["FullName"]
        chamber = None
        if location:
            for keyword, code in (
                ("House", "lower"),
                ("Senate", "upper"),
                ("Joint", "legislature"),
            ):
                if keyword in location:
                    chamber = code
                    break
        if chamber is None:
            self.warning("Bad Vote chamber: '%s', skipping" % location)
            return

        motion = page["actionLog"]["StatusText"]
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = page["Yeas"]
        no_count = page["Nays"]
        excused_count = page["Excused"]
        absent_count = page["Absent"]

        if motion.startswith("Do Pass"):
            vtype = "passage"
        elif motion == "Concurred in amendments":
            vtype = "amendment"
        else:
            # "Veto override" is intentionally not mapped until that
            # classification is added back to OS-core.
            vtype = []

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if yes_count > no_count else "fail",
            classification=vtype,
            bill=bill,
        )
        # differentiate nearly identical votes
        vote.dedupe_key = url

        vote.add_source(url)
        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("excused", excused_count),
            ("absent", absent_count),
        ):
            vote.set_count(option, total)

        for person in page["RollCalls"]:
            cast = person["Vote1"]
            if cast == "Aye" or cast == "Yea":
                vote.yes(person["UniqueName"])
            elif cast == "Nay":
                vote.no(person["UniqueName"])
            elif cast == "Excused":
                vote.vote("excused", person["UniqueName"])
            elif cast == "Absent":
                vote.vote("absent", person["UniqueName"])
            # any other value is left unrecorded

        yield vote
Ejemplo n.º 30
0
    def _parse_votes(self, url, vote, bill):
        """Given a vote url and a vote dict, extract the voters and the vote
        counts from the vote page and return a ``VoteEvent``.

        ``vote`` is read for its ``action``, ``vote_url``, ``chamber`` and
        ``date`` keys and gains a ``motion`` key as a side effect.

        PDF urls are delegated to ``PDFCommitteeVote``.  ``None`` is returned
        when the PDF cannot be fetched or parsed, or when the HTML page has
        fewer than two tables.  Raises ``Exception`` when the outcome cannot
        be determined from the action text, or when the action text carries
        both a passage and a failure indicator while the yeas outnumber the
        nays.
        """
        if url.lower().endswith(".pdf"):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = "No document found at url %r" % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        tables = doc.xpath("//table")

        # Yes, no, excused, absent.
        try:
            vals = tables[1].xpath("tr/td/text()")
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion: the text that follows the last <br>.  Some pages
        # mysteriously have no motion listed (no <br> at all, or nothing
        # after it, in which case ``tail`` is None); use the action instead.
        line_breaks = doc.xpath("//br")
        motion = (line_breaks[-1].tail or "").strip() if line_breaks else ""
        if not motion:
            motion = vote["action"]

        vote["motion"] = motion

        action = vote["action"]
        vote_url = vote["vote_url"]

        vote_event = VoteEvent(
            chamber=vote["chamber"],
            start_date=vote["date"],
            motion_text=vote["motion"],
            result="fail",  # placeholder
            classification="passage",
            bill=bill,
            bill_action=vote["action"],
        )
        vote_event.dedupe_key = vote_url  # URL contains sequence number
        vote_event.add_source(vote_url)
        vote_event.set_count("yes", yes_count)
        vote_event.set_count("no", no_count)
        vote_event.set_count("excused", excused_count)
        vote_event.set_count("absent", absent_count)

        # Text in brackets after a name is treated as the short name.
        short_name_re = re.compile(r".*?\((.*?)\)")
        for text in tables[2].xpath("tr/td/text()"):
            if not text.strip("\xa0"):
                continue
            # Each cell is "<code>\xa0...\xa0<name>".
            v, name = filter(None, text.split("\xa0"))
            short_name = short_name_re.findall(name)
            note = ("Short Name: " + short_name[0]) if short_name else ""
            # Name without brackets like 'Kary, Douglas'
            name = re.sub(r"[\(\[].*?[\)\]]", "", name)
            if v == "Y":
                vote_event.yes(name, note=note)
            elif v == "N":
                vote_event.no(name, note=note)
            elif v == "E":
                vote_event.vote("excused", name, note=note)
            elif v == "A":
                vote_event.vote("absent", name, note=note)

        # code to determine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # outnumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote_event.result = "pass" if passed else "fail"

        return vote_event