Beispiel #1
0
    def asvote(self):
        """Assemble a ``VoteEvent`` from this object's accessor methods.

        Chamber, date, motion and outcome come from ``self.chamber()``,
        ``self.date()``, ``self.motion()`` and ``self.passed()``; tallies and
        individual voters come from the ``*_count()`` / ``*_votes()`` methods.

        Returns:
            The populated ``VoteEvent``, with ``self.url`` recorded both as
            its ``pupa_id`` and as its source.
        """
        chamber = self.chamber()
        when = self.date()
        motion = self.motion()
        outcome = "pass" if self.passed() else "fail"

        event = VoteEvent(
            chamber=chamber,
            start_date=when,
            motion_text=motion,
            result=outcome,
            classification="passage",
            bill=self.bill,
        )
        # URL contains sequence number
        event.pupa_id = self.url

        # Tallies, in yes / no / other order.
        for option, tally in (
            ("yes", self.yes_count),
            ("no", self.no_count),
            ("other", self.other_count),
        ):
            event.set_count(option, tally())

        # Individual voters, in the same order as the tallies.
        for record, voters in (
            (event.yes, self.yes_votes),
            (event.no, self.no_votes),
        ):
            for name in voters():
                record(name)
        for name in self.other_votes():
            event.vote("other", name)

        event.add_source(self.url)
        return event
Beispiel #2
0
    def parse_vote(
        self, bill, journal_entry_number, action, act_chamber, act_date, url
    ):
        """Yield one ``VoteEvent`` built from the tallies in an action string.

        ``action`` is split on whitespace; every token made of a non-digit
        followed by a digit is read as a tally: a token containing "Y" sets
        the yes count, "N" the no count, and "E" or "A" add to the other
        count. The result is taken from the words "PASSED" / "FAILED" when
        present, otherwise from comparing yes against no.

        Args:
            bill: Passed through to the ``VoteEvent``.
            journal_entry_number: Appended to the motion text after " #".
            action: Action text holding the tallies and, optionally, the
                outcome keyword.
            act_chamber: Chamber recorded on the vote.
            act_date: Object whose ``strftime`` gives the vote date.
            url: Recorded as the vote's source; it is not fetched here.

        Yields:
            The single ``VoteEvent`` for this action.
        """
        tallies = {"yes": 0, "no": 0, "other": 0}
        for token in action.split():
            if not re.match(r"[\D][\d]", token):
                continue
            if "Y" in token:
                tallies["yes"] = int(token[1:])
            elif "N" in token:
                tallies["no"] = int(token[1:])
            elif "E" in token or "A" in token:
                tallies["other"] += int(token[1:])

        # An explicit keyword wins; otherwise fall back to the tallies.
        if "PASSED" in action:
            outcome = "pass"
        elif "FAILED" in action:
            outcome = "fail"
        elif tallies["yes"] > tallies["no"]:
            outcome = "pass"
        else:
            outcome = "fail"

        event = VoteEvent(
            bill=bill,
            start_date=act_date.strftime("%Y-%m-%d"),
            chamber=act_chamber,
            motion_text=action + " #" + journal_entry_number,
            result=outcome,
            classification="passage",
        )

        for option in ("yes", "no", "other"):
            event.set_count(option, tallies[option])
        event.add_source(url)

        yield event
Beispiel #3
0
    def process_committee_vote(self, committee_action, bill):
        """Build a committee ``VoteEvent`` from one committee action record.

        Args:
            committee_action: Mapping read for "ActionDate" and "Vote".
                "Vote" is iterated as records carrying "VoteType" and
                "VoteCount" (an empty-string count is treated as 0).
            bill: Passed through to the ``VoteEvent``.

        Returns:
            The ``VoteEvent`` (chamber "legislature", classification
            "committee"), or ``None`` when either required key is missing.
        """
        try:
            date = committee_action["ActionDate"]
            vote_info = committee_action["Vote"]
        except KeyError:
            self.logger.warning("Committee vote has no data. Skipping.")
            return None
        date = self.date_format(date)

        # Every tally starts at zero so that a record set lacking a "Yes" or
        # "No" entry compares as 0 instead of raising UnboundLocalError.
        yes_count = no_count = other_count = 0
        for tally in vote_info:
            vote_count = 0 if tally["VoteCount"] == "" else int(tally["VoteCount"])

            if tally["VoteType"] == "Yes":
                yes_count = vote_count
            elif tally["VoteType"] == "No":
                no_count = vote_count
            else:
                other_count += vote_count

        result = "pass" if yes_count > no_count else "fail"

        event = VoteEvent(
            chamber="legislature",
            start_date=date,
            motion_text="Committee Vote",
            result=result,
            classification="committee",
            bill=bill,
        )
        event.set_count("yes", yes_count)
        event.set_count("no", no_count)
        event.set_count("other", other_count)

        return event
    def scrape_vote(self, bill, date, url):
        """Yield the ``VoteEvent`` described by the JSON document at ``url``.

        The chamber is derived from ``actionLog.FullName`` ("House",
        "Senate" or "Joint"); an empty or unrecognised value logs a warning
        and yields nothing. A vote without ``actionLog.StatusText`` is
        skipped silently.

        Args:
            bill: Passed through to the ``VoteEvent``.
            date: Start date recorded on the vote.
            url: Location of the JSON document; also used as the vote's
                source and ``dedupe_key``.

        Yields:
            At most one ``VoteEvent``.
        """
        data = self.get(url).json()

        location = data["actionLog"]["FullName"]
        chamber = None
        if location:
            for keyword, code in (
                ("House", "lower"),
                ("Senate", "upper"),
                ("Joint", "legislature"),
            ):
                if keyword in location:
                    chamber = code
                    break
        if chamber is None:
            self.warning("Bad Vote chamber: '%s', skipping" % location)
            return

        motion = data["actionLog"]["StatusText"]
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = data["Yeas"]
        no_count = data["Nays"]
        excused_count = data["Excused"]
        absent_count = data["Absent"]

        outcome = "pass" if yes_count > no_count else "fail"

        # "Veto override" is deliberately unclassified until veto-override
        # is added back to OS-core.
        if motion.startswith("Do Pass"):
            vtype = "passage"
        elif motion == "Concurred in amendments":
            vtype = "amendment"
        else:
            vtype = []

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result=outcome,
            classification=vtype,
            bill=bill,
        )
        # differentiate nearly identical votes
        vote.dedupe_key = url

        vote.add_source(url)
        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("excused", excused_count),
            ("absent", absent_count),
        ):
            vote.set_count(option, total)

        # Roll-call entries with any other value in "Vote1" are ignored.
        for member in data["RollCalls"]:
            choice = member["Vote1"]
            if choice in ("Aye", "Yea"):
                vote.yes(member["UniqueName"])
            elif choice == "Nay":
                vote.no(member["UniqueName"])
            elif choice == "Excused":
                vote.vote("excused", member["UniqueName"])
            elif choice == "Absent":
                vote.vote("absent", member["UniqueName"])

        yield vote
    def parse_bill_actions_table(self, bill, action_table, bill_id, session,
                                 url, bill_chamber):
        """Add every action row to ``bill`` and yield the votes found in them.

        The first child of ``action_table`` is skipped; each remaining row
        supplies a date, an actor code and the action text. Committee
        short-codes returned by ``categorize_action`` are translated to full
        names through ``self.short_ids`` (unknown codes are dropped). When
        ``self.parse_vote`` finds a vote in the text, a ``VoteEvent`` is
        yielded; its motion is prefixed with "Reconsider: " if the same
        actor had a "reconsider" action since its previous vote.

        Args:
            bill: Object the actions are added to.
            action_table: Element whose children are the action rows.
            bill_id: Identifier stored on each yielded vote.
            session: Legislative session stored on each yielded vote.
            url: Source recorded on each yielded vote.
            bill_chamber: Chamber of the bill, stored on each yielded vote.

        Yields:
            One ``VoteEvent`` per action row that contains a vote.
        """
        # Actors with a reconsideration pending since their last vote.
        pending_reconsideration = set()

        rows = action_table.xpath("*")
        for row in rows[1:]:
            when = row[0].text_content()
            when = dt.datetime.strptime(when, "%m/%d/%Y").strftime("%Y-%m-%d")
            actor_code = row[1].text_content().upper()
            description = row[2].text_content()
            actor = self._vote_type_map[actor_code]
            act_type, committee_codes = categorize_action(description)

            # Translate short-codes to full committee names for the matcher;
            # codes with no known name are skipped.
            committee_names = []
            for code in committee_codes or ():
                try:
                    full_name = self.short_ids[code]["name"]
                except KeyError:
                    continue
                committee_names.append(full_name)

            added = bill.add_action(description,
                                    when,
                                    chamber=actor,
                                    classification=act_type)
            for full_name in committee_names:
                added.add_related_entity(name=full_name,
                                         entity_type="organization")

            parsed = self.parse_vote(description)
            if not parsed:
                if re.search("reconsider", description, re.IGNORECASE):
                    pending_reconsideration.add(actor)
                continue

            tallies, motion = parsed
            if actor in pending_reconsideration:
                motion_text = "Reconsider: " + motion
            else:
                motion_text = motion
            outcome = "pass" if "passed" in description.lower() else "fail"

            event = VoteEvent(
                start_date=when,
                chamber=actor,
                bill=bill_id,
                bill_chamber=bill_chamber,
                legislative_session=session,
                motion_text=motion_text,
                result=outcome,
                classification="passage",
            )
            pending_reconsideration.discard(actor)
            event.add_source(url)
            event.set_count("yes", int(tallies["n_yes"] or 0))
            event.set_count("no", int(tallies["n_no"] or 0))
            event.set_count("not voting", int(tallies["n_excused"] or 0))

            # "yes_resv" voters are recorded as plain yes votes.
            for key, record in (
                ("yes", event.yes),
                ("yes_resv", event.yes),
                ("no", event.no),
            ):
                for raw_name in split_specific_votes(tallies[key]):
                    record(self.clean_voter_name(raw_name))
            for raw_name in split_specific_votes(tallies["excused"]):
                event.vote("not voting", self.clean_voter_name(raw_name))

            yield event
Beispiel #6
0
    def handle_page(self):
        """Yield the committee ``VoteEvent`` parsed from this page's text.

        The motion is whatever follows "FINAL ACTION:" on ``self.lines[5]``;
        an empty motion logs a warning and yields nothing. The first line
        with repeated "Yea Nay" headings fixes the column positions, and the
        lines after it (up to the first blank one) are read as members. A
        member whose line starts with "X", "VA" or "VC" is counted as yes or
        no depending on the column of that marker; every other member is
        recorded as "other". Tallies come from the "TOTALS" row of
        ``self.text``.

        Yields:
            A single ``VoteEvent`` (chamber "upper", classification
            "committee").

        Raises:
            ValueError: if a vote marker sits between the Yea and Nay
                columns.
        """
        _, motion = self.lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.scraper.warning("Vote appears to be empty")
            return

        header_row = [
            idx for idx, text in enumerate(self.lines)
            if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", text)
        ][0]
        header = self.lines[header_row]
        yea_columns_end = header.index("Yea") + len("Yea")
        nay_columns_begin = header.index("Nay")

        member_re = re.compile(
            r"""(?x)
                        ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                        ([A-Z][a-z]+            # Name must have lower-case characters
                        [\w\-\s]+)              # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                        (?:\s{2,}.*)?           # Name ends when many spaces are seen
                        """
        )

        votes = {"yes": [], "no": [], "other": []}
        for line in self.lines[(header_row + 1):]:
            # The member list ends at the first blank line.
            if not line.strip():
                break

            member = member_re.search(line).group(1)
            # Drop trailing X's that belong to other motions on the sheet,
            # which are not collected.
            member = re.sub(r"(\s+X)+", "", member)

            # Non-voting members usually have no code at all; only "X",
            # "VA" (vote after roll call) and "VC" (vote change) count as
            # an actual vote.
            if not re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line):
                votes["other"].append(member)
                continue

            # The marker's column decides between yea and nay.
            marker_column = len(line) - len(line.lstrip())
            if marker_column <= yea_columns_end:
                votes["yes"].append(member)
            elif marker_column >= nay_columns_begin:
                votes["no"].append(member)
            else:
                raise ValueError(
                    "Unparseable vote found for {0} in {1}:\n{2}".
                    format(member, self.url, line))

        yes_total, no_total = re.search(
            r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS", self.text
        ).groups()
        yes_count = int(yes_total)
        no_count = int(no_total)
        if yes_count > no_count:
            outcome = "pass"
        else:
            outcome = "fail"

        event = VoteEvent(
            start_date=self.kwargs["date"],
            bill=self.kwargs["bill"],
            chamber="upper",
            motion_text=motion,
            classification="committee",
            result=outcome,
        )
        event.add_source(self.url)
        event.set_count("yes", yes_count)
        event.set_count("no", no_count)
        event.set_count("other", len(votes["other"]))

        # Record the individual voters.
        for option, names in votes.items():
            for raw_name in names:
                name = raw_name.strip()
                # A few names carry a run of spaces followed by "VA"
                # (e.g. "Cruz          VA"); the last two tokens are cut.
                if "  VA" in name:
                    name = " ".join(name.split()[:-2])
                if name:
                    event.vote(option, name)

        yield event
Beispiel #7
0
    def scrape(self, session=None):
        """Yield every bill, and its votes, for one legislative session.

        For each entry of ``self._categories`` the bulk listing is fetched,
        and for each piece of legislation in it a ``Bill`` is built from the
        listing (history actions and their documents) and from a follow-up
        ``LegislationDetails`` request (sponsors, hearing and markup
        documents, action attachments, votes).

        Args:
            session: Session identifier; when falsy,
                ``self.latest_session()`` is used.

        Yields:
            Each ``VoteEvent`` found for a bill, followed by the ``Bill``
            itself.

        Raises:
            requests.HTTPError: from ``raise_for_status()`` when either API
                request returns an error status.
        """
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        for category in self._categories:
            leg_listing_url = (
                self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
            )
            # NOTE(review): verify=False turns off TLS certificate checking
            # for this request; confirm it is still required.
            resp = requests.post(leg_listing_url, headers=self._headers, verify=False)
            resp.raise_for_status()
            leg_listing = resp.json()

            for leg in leg_listing:

                bill = Bill(
                    leg["legislationNumber"],
                    legislative_session=session,
                    title=leg["title"],
                    classification=category["name"],
                )
                bill.add_source(leg_listing_url)
                bill_url = (
                    f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
                )
                bill.add_source(bill_url)

                if leg["lawNumber"]:
                    bill.extras["lawNumber"] = leg["lawNumber"]

                # Actions
                for hist in leg["legislationHistory"]:
                    hist_date = datetime.datetime.strptime(
                        hist["actionDate"], "%b %d, %Y"
                    )
                    hist_date = self._TZ.localize(hist_date)
                    hist_action = hist["actionDescription"]
                    # Strip the leading "Other" from these two prefixes.
                    if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                        hist_action = hist_action[5:]
                    hist_class = self.classify_action(hist_action)

                    if "mayor" in hist_action.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"
                    bill.add_action(
                        hist_action, hist_date, classification=hist_class, chamber=actor
                    )

                    # Documents with download links
                    if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                        download = hist["downloadURL"]
                        if not download.startswith("http"):
                            download = "https://lims.dccouncil.us/" + download

                        mimetype = (
                            "application/pdf" if download.endswith("pdf") else None
                        )
                        is_version, doc_type = self._attachment_doc_type(download)

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                hist["actionDescription"],
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                # Grabs Legislation details
                leg_details_url = (
                    self._API_BASE_URL
                    + f"LegislationDetails/{leg['legislationNumber']}"
                )
                # NOTE(review): verify=False again disables certificate checks.
                details_resp = requests.get(
                    leg_details_url, headers=self._headers, verify=False,
                )
                details_resp.raise_for_status()
                leg_details = details_resp.json()

                # Sponsors
                for introducer in leg_details["introducers"]:
                    bill.add_sponsorship(
                        introducer["memberName"],
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                # Co-sponsors: the name must come from the co-sponsor record
                # itself (not from the last introducer), and a co-sponsor is
                # not a primary sponsor.
                for cosponsor in leg_details["coSponsors"] or []:
                    bill.add_sponsorship(
                        cosponsor["memberName"],
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

                # Committee Hearing Doc
                for hearing in leg_details["committeeHearing"]:
                    if hearing["hearingRecord"]:
                        bill.add_document_link(
                            hearing["hearingType"],
                            hearing["hearingRecord"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                for markup in leg_details["committeeMarkup"]:
                    if markup["committeeReport"]:
                        bill.add_document_link(
                            "Committee Markup",
                            markup["committeeReport"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                # Actions and Votes
                # Identifiers already emitted for this bill, to prevent
                # duplicate votes.
                vote_ids = set()
                for act in leg_details["actions"] or []:
                    action_name = act["action"]
                    action_date = datetime.datetime.strptime(
                        act["actionDate"][:10], "%Y-%m-%d"
                    )
                    action_date = self._TZ.localize(action_date)

                    if action_name.split()[0] == "Other":
                        action_name = " ".join(action_name.split()[1:])

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    # Documents and Versions
                    attachment = act["attachment"]
                    if attachment:
                        mimetype = (
                            "application/pdf" if attachment.endswith("pdf") else None
                        )
                        is_version, doc_type = self._attachment_doc_type(attachment)

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                attachment,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            # When no keyword matched, label the document
                            # with the action name rather than an unbound or
                            # stale type left over from an earlier attachment.
                            bill.add_document_link(
                                doc_type or action_name,
                                attachment,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                    # Votes
                    vote_details = act["voteDetails"]
                    if not vote_details:
                        continue
                    result = vote_details["voteResult"]
                    if not result:
                        continue

                    status = self._vote_statuses[result.lower()]
                    id_text = (
                        str(leg["legislationNumber"])
                        + "-"
                        + action_name
                        + "-"
                        + result
                    )
                    if id_text in vote_ids:
                        continue
                    vote_ids.add(id_text)

                    action_class = self.classify_action(action_name)
                    v = VoteEvent(
                        identifier=id_text,
                        chamber=actor,
                        start_date=action_date,
                        motion_text=action_name,
                        result=status,
                        classification=action_class,
                        bill=bill,
                    )
                    v.add_source(leg_listing_url)

                    yes_count = no_count = absent_count = 0
                    abstain_count = other_count = 0
                    for leg_vote in vote_details["votes"]:
                        mem_name = leg_vote["councilMember"]
                        if leg_vote["vote"] == "Yes":
                            yes_count += 1
                            v.yes(mem_name)
                        elif leg_vote["vote"] == "No":
                            no_count += 1
                            v.no(mem_name)
                        elif leg_vote["vote"] == "Absent":
                            absent_count += 1
                            v.vote("absent", mem_name)
                        elif leg_vote["vote"] == "Recused":
                            abstain_count += 1
                            v.vote("abstain", mem_name)
                        else:
                            # "Present" and anything new that pops up
                            other_count += 1
                            v.vote("other", mem_name)

                    v.set_count("yes", yes_count)
                    v.set_count("no", no_count)
                    v.set_count("absent", absent_count)
                    v.set_count("abstain", abstain_count)
                    v.set_count("other", other_count)
                    yield v

                yield bill

    @staticmethod
    def _attachment_doc_type(link):
        """Classify a document link by the keywords found in it.

        Args:
            link: URL or path of the document.

        Returns:
            ``(is_version, doc_type)``. ``is_version`` is True when the
            link contains one of the version keywords; ``doc_type`` is the
            last matching keyword, "Amendment" when the link contains
            "amendment", or ``None`` when nothing matched.
        """
        lowered = link.lower()
        is_version = False
        doc_type = None
        for version_type in ("SignedAct", "Introduction", "Enrollment", "Engrossment"):
            if version_type.lower() in lowered:
                is_version = True
                doc_type = version_type
        if "amendment" in lowered:
            doc_type = "Amendment"
        return is_version, doc_type
Beispiel #8
0
    def scrape_senate_vote(self, session, period, roll_call):
        """Yield the ``VoteEvent`` for one Senate roll-call vote.

        Args:
            session: Value substituted for ``{session}`` in the roll-call
                URL; also stored as the vote's legislative session.
            period: Value substituted for ``{period}`` in the URL.
            roll_call: Vote id substituted for ``{vote_id}`` in the URL.

        Yields:
            A single ``VoteEvent`` with chamber "upper", identified as
            ``us-<year>-upper-<vote number>``.

        Raises:
            IndexError: if an expected element is missing from the
                document.
            KeyError: if the result text or a vote choice is not present in
                ``self.senate_statuses`` / ``self.vote_codes``.
        """
        url = (
            "https://www.senate.gov/legislative/LIS/roll_call_votes/vote{session}{period}/"
            "vote_{session}_{period}_{vote_id}.xml")
        url = url.format(session=session, period=period, vote_id=roll_call)
        page = lxml.html.fromstring(self.get(url).content)

        vote_date = page.xpath("//roll_call_vote/vote_date/text()")[0].strip()
        # %I (12-hour clock) is required for %p to take effect; with %H the
        # AM/PM marker is parsed but ignored, putting PM votes 12 hours early.
        when = self._TZ.localize(
            datetime.datetime.strptime(vote_date, "%B %d, %Y, %I:%M %p"))

        # The vote number as published in the document itself.
        vote_number = page.xpath("//roll_call_vote/vote_number/text()")[0]
        vote_id = "us-{}-upper-{}".format(when.year, vote_number)

        # note: not everything the senate votes on is a bill, this is OK
        # non bills include nominations and impeachments
        doc_type = page.xpath(
            "//roll_call_vote/document/document_type/text()")[0]

        # A vote on an amendment is attached to the document being amended.
        amended_document = page.xpath(
            "//roll_call_vote/amendment/amendment_to_document_number/text()")
        if amended_document:
            bill_id = amended_document[0].replace(".", "")
        else:
            bill_id = page.xpath(
                "//roll_call_vote/document/document_name/text()"
            )[0].replace(".", "")

        motion = page.xpath("//roll_call_vote/vote_question_text/text()")[0]
        result_text = page.xpath("//roll_call_vote/vote_result/text()")[0]
        result = self.senate_statuses[result_text]

        vote = VoteEvent(
            start_date=when,
            bill_chamber="lower" if doc_type[0] == "H" else "upper",
            motion_text=motion,
            classification="passage",  # TODO
            result=result,
            legislative_session=session,
            identifier=vote_id,
            bill=bill_id,
            chamber="upper",
        )

        vote.add_source(url)

        vote.extras["senate-rollcall-num"] = vote_number

        yeas = page.xpath("//roll_call_vote/count/yeas/text()")[0]
        nays = page.xpath("//roll_call_vote/count/nays/text()")[0]

        # The absent / present counts may have no text; treat that as zero.
        absent_text = page.xpath("//roll_call_vote/count/absent/text()")
        absents = absent_text[0] if absent_text else 0
        present_text = page.xpath("//roll_call_vote/count/present/text()")
        presents = present_text[0] if present_text else 0

        vote.set_count("yes", int(yeas))
        vote.set_count("no", int(nays))
        vote.set_count("absent", int(absents))
        vote.set_count("abstain", int(presents))

        for row in page.xpath("//roll_call_vote/members/member"):
            lis_id = row.xpath("lis_member_id/text()")[0]
            name = row.xpath("member_full/text()")[0]
            choice = row.xpath("vote_cast/text()")[0]

            vote.vote(self.vote_codes[choice], name, note=lis_id)

        yield vote
Beispiel #9
0
    def scrape_votes(self, bill, url):
        """Yield one vote per roll call found on the page at ``url``.

        Each ``<p>`` whose text matches "OKLAHOMA HOUSE" or "OKLAHOMA STATE
        SENATE" starts a roll call. The motion, RCS number, date, tallies
        and voter names are read from the paragraphs that follow it, up to
        a line containing "*****".

        Args:
            bill: Passed through to each vote.
            url: Page to fetch; also recorded as each vote's source and
                combined with the RCS number as its ``dedupe_key``.

        Yields:
            One vote object per distinct RCS number whose tallies parsed
            cleanly.

        Raises:
            Exception: if a name collected under "no" contains ":".
        """
        # Non-breaking spaces are normalised before parsing.
        page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

        # RCS numbers already handled, so a repeated roll call is skipped.
        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={"re": re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if "HOUSE" in header.xpath("string()"):
                chamber = "lower"
                motion_index = 8
            else:
                chamber = "upper"
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r"\s+", " ", motion)
            if not motion.strip():
                self.warning("Motion text not found")
                # NOTE(review): this ends the whole generator, so any later
                # roll calls on the page are not processed — confirm that
                # `continue` is not what was intended.
                return
            # A trailing PASSED/FAILED on the motion gives the outcome.
            match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == "PASSED"
            else:
                # Decided from the tallies further down.
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            # The date is read from the element right after the RCS line.
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r"\d+/\d+/\d+", date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the category that following name lines belong to.
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            # Names are only collected once a YEAS heading has been seen.
            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace("\r\n", " ").strip()
                if "*****" in line:
                    break
                regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                         r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
                match = re.match(regex, line)
                if match:
                    if match.group(1) == "YEAS" and "RCS#" not in line:
                        vtype = "yes"
                        seen_yes = True
                    elif match.group(1) == "NAYS" and seen_yes:
                        vtype = "no"
                    elif match.group(1) == "VACANT":
                        continue  # skip these
                    elif seen_yes:
                        vtype = "other"
                    # Text after the tally on a heading line marks the roll
                    # call as malformed; it is dropped after the loop.
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    # Headings matched before YEAS leave vtype unchanged, so
                    # their number is added under the current vtype
                    # (initially None).
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    # Names on a line are separated by three spaces.
                    for name in line.split("   "):
                        if not name:
                            continue
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                # Passes only when yes outnumbers no and other combined.
                passed = counts["yes"] > (counts["no"] + counts["other"])

            # NOTE(review): `Vote` must be bound at module level —
            # presumably an alias of the vote-event class; confirm against
            # the file's imports.
            vote = Vote(
                chamber=chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])
            vote.dedupe_key = url + "#" + rcs

            vote.add_source(url)

            for name in votes["yes"]:
                vote.yes(name)
            for name in votes["no"]:
                # A colon here means a heading line was mistaken for a name.
                if ":" in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes["other"]:
                vote.vote("other", name)

            yield vote
Beispiel #10
0
    def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):
        """Yield one VoteEvent per floor-vote table on the bill's Assembly page.

        The page at ``assembly_url`` (with the floor-votes flag appended) is
        fetched; if its text does not contain "Votes:" nothing is yielded.
        For every ``<table>`` the caption supplies the date, the motion and
        the yes/no tally, and the table cells supply (name, vote code) pairs.

        ``session`` and ``bill_id`` are accepted but not used in this method.
        """
        # parse the bill data page, finding the latest html text
        url = assembly_url + "&Floor%26nbspVotes=Y"

        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        if "Votes:" not in doc.text_content():
            return

        # Vote codes used in the table cells -> option recorded on the event.
        option_for_code = {
            "Y": "yes",
            "NO": "no",
            "ER": "excused",
            "AB": "absent",
            "NV": "not voting",
            "EL": "other",
        }

        seen_motions = []
        # NOTE(review): this single counter is shared by every repeated
        # motion on the page, so suffix numbers are not per-motion -- confirm
        # that is intended before relying on the " - Vote N" labels.
        next_repeat_number = 2

        for table in doc.xpath("//table"):
            # The date text lives in the element that follows the "DATE:" span.
            date_label = table.xpath('caption/span[contains(., "DATE:")]')[0]
            raw_date = next(date_label.itersiblings()).text
            when = datetime.datetime.strptime(raw_date, "%m/%d/%Y")
            when = eastern.localize(when).isoformat()

            caption_texts = table.xpath("caption/span/text()")
            motion = caption_texts[2].strip() + caption_texts[3].strip()
            if motion not in seen_motions:
                seen_motions.append(motion)
            else:
                # Repeated motion text: make it distinguishable.
                motion = motion + f" - Vote {next_repeat_number}"
                next_repeat_number += 1

            # Caption tally has the shape "<label>:<yes>/<no>".
            tally_text = table.xpath("caption/span/span")[0].text
            yes_count, no_count = map(int, tally_text.split(":")[1].split("/"))

            vote = VoteEvent(
                chamber="lower",
                start_date=when,
                motion_text=motion,
                bill=bill,
                result="pass" if yes_count > no_count else "fail",
                classification="passage",
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)

            # Only absent / excused members are tallied from the cells.
            tallied = {"AB": 0, "ER": 0}
            cells = table.xpath("tr/td/text()")
            for start in range(0, len(cells), 2):
                # Cells alternate: member name, then that member's vote code.
                name, code = cells[start : start + 2]
                vote.vote(option_for_code[code], name)
                if code in tallied:
                    tallied[code] += 1

            vote.set_count("absent", tallied["AB"])
            vote.set_count("excused", tallied["ER"])
            vote.add_source(url)
            vote.dedupe_key = url + motion + caption_texts[1]

            yield vote
Beispiel #11
0
    def scrape_votes(self, session):
        """Yield the roll-call votes for ``session`` from the two NH data files.

        ``RollCallSummary.txt`` is read first: each pipe-delimited record that
        belongs to ``session`` and to a bill in ``self.bills_by_id`` becomes
        one vote, stored under the key ``body + vote_num``.
        ``RollCallHistory.txt`` is read second: each record adds one member's
        vote to the matching stored vote.  Members are resolved through
        ``self.legislators``; records for unknown members or unknown roll
        calls are skipped with a warning.

        :param session: session year string compared against the first field
            of every record.
        :return: generator over the assembled vote objects.
        """
        votes = {}
        other_counts = defaultdict(int)
        last_line = []
        vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
        lines = self.get(vote_url).content.decode("utf-8").splitlines()

        for line in lines:

            if len(line) < 2:
                continue

            if line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                if len(last_line + line[1:]) == 14:
                    # This fragment completes the previously remembered short
                    # record: use the stitched record (the fragment's first
                    # field is discarded, exactly as the length check above
                    # assumes).  Previously only ``last_line`` was used, which
                    # left the record short.
                    line = last_line + line[1:]
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
                    if len(line) < 12:
                        # Fields up to index 11 are read below; a shorter
                        # fragment cannot be parsed on its own, so wait for
                        # the line that completes it instead of raising
                        # IndexError.
                        continue

            # The payload is decoded as UTF-8, so a leading byte-order mark
            # shows up as U+FEFF; the older byte-wise spelling is kept too.
            session_yr = line[0].replace("\xef\xbb\xbf", "").replace("\ufeff", "")
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                when = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                when = pytz.timezone("America/New_York").localize(when).isoformat()
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=when,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(vote_url)
                vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
                votes[body + vote_num] = vote

        history_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt"
        for line in self.get(history_url).content.decode("utf-8").splitlines():
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, vote, date = line.split("|")

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    # Collapse runs of whitespace inside the stored name.
                    leg = " ".join(self.legislators[employee]["name"].split())
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue
                # code = self.legislators[employee]['seat']

                if vote == "Yea":
                    votes[vote_key].yes(leg)
                elif vote == "Nay":
                    votes[vote_key].no(leg)
                else:
                    votes[vote_key].vote("other", leg)
                    # hack-ish, but will keep the vote count sync'd
                    other_counts[vote_key] += 1
                    votes[vote_key].set_count("other", other_counts[vote_key])

        yield from votes.values()
    def scrape_vote_history(self, bill, vurl):
        """
        Obtain the votes listed on a vote-history page and link them to the
        related Bill.

        The first two table rows are skipped; any later row that does not have
        exactly 11 cells is skipped with a warning.  A roll-call PDF that has
        already been used (tracked in ``self._seen_vote_ids``) is skipped
        before it is scraped again.

        :param bill: related bill
        :param vurl: source for the voteEvent information.
        :return: generator yielding one voteEvent object per accepted row
        :raises AssertionError: when yeas + nays + others does not equal the
            row's total column.
        """
        html = self.get(vurl).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(vurl)

        # skip first two rows
        for row in doc.xpath("//table/tr")[2:]:
            tds = row.getchildren()
            if len(tds) != 11:
                self.warning("irregular vote row: %s" % vurl)
                continue
            (
                timestamp,
                motion,
                vote_cell,
                yeas,
                nays,
                nv,
                exc,
                pres,
                abst,
                total,
                result,
            ) = tds

            # Non-breaking spaces in the cell are normalised before parsing.
            timestamp = timestamp.text.replace("\xa0", " ")
            # NOTE(review): "%H" combined with "%p" ignores the AM/PM marker
            # when computing the hour.  Only the date part is used below, so
            # the format is left as-is; switch to "%I" if the time is ever
            # needed.
            timestamp = datetime.datetime.strptime(timestamp, "%m/%d/%Y %H:%M %p")

            yeas = int(yeas.text)
            nays = int(nays.text)
            others = int(nv.text) + int(exc.text) + int(abst.text) + int(pres.text)
            assert yeas + nays + others == int(total.text)

            if result.text == "Passed":
                passed = "pass"
            else:
                passed = "fail"

            vote_link = vote_cell.xpath("a")[0]
            if "[H]" in vote_link.text:
                chamber = "lower"
            else:
                chamber = "upper"

            vote = VoteEvent(
                chamber=chamber,  # 'upper' or 'lower'
                start_date=timestamp.strftime("%Y-%m-%d"),  # 'YYYY-MM-DD' format
                motion_text=motion.text,
                result=passed,
                classification="passage",  # Can also be 'other'
                # Provide a Bill instance to link with the VoteEvent...
                bill=bill,
            )

            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.set_count("other", others)

            vote.add_source(vurl)

            rollcall_pdf = vote_link.get("href")
            # Check for a repeated roll call *before* scraping the PDF so a
            # duplicate does not cost a scrape whose result is thrown away.
            if rollcall_pdf in self._seen_vote_ids:
                self.warning("duplicate usage of %s, skipping", rollcall_pdf)
                continue
            self._seen_vote_ids.add(rollcall_pdf)

            # obtain vote rollcall from pdf and add it to the VoteEvent object
            self.scrape_rollcall(vote, rollcall_pdf)
            vote.add_source(rollcall_pdf)
            vote.dedupe_key = rollcall_pdf  # distinct KEY for each one

            yield vote
Beispiel #13
0
    def scrape_action_page(self, bill, page):
        """Record every row of the action table on ``page`` as a bill action.

        For rows that mention a House "Supplement" or a Senate "Roll Call", a
        VoteEvent is also assembled and its roll-call document scraped.  The
        ``yield`` statements for those events are commented out, so they are
        never emitted and this method is a plain function, not a generator.

        A House row whose text does not match the YEAS pattern is skipped
        entirely (no action is added for it).
        """
        house_motion_re = r"(.+)-\s*\d+\s*YEAS"

        for row in page.xpath("//tbody/tr"):
            parsed_date = datetime.strptime(row.xpath("td[1]/text()")[0], "%m/%d/%Y")
            action_year = parsed_date.year
            action_date = parsed_date.strftime("%Y-%m-%d")

            # NOTE(review): when the actor cell is empty, ``action_actor``
            # keeps the value from the previous row (and is unbound if the
            # very first row has no actor) -- confirm that carry-over is
            # intended.
            actor_texts = row.xpath("td[2]/text()")
            if actor_texts:
                action_actor = self.chamber_map_reverse[actor_texts[0].strip()]

            action_name = row.xpath("string(td[3])")

            # House votes
            if "Supplement" in action_name:
                motion_matches = re.findall(house_motion_re, action_name)
                if not motion_matches:
                    self.warning(
                        "vote {} did not match regex, skipping".format(action_name)
                    )
                    continue

                yeas = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
                nays = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

                # get supplement number
                supplement_no = int(
                    re.findall(r"No\.\s*(\d+)", action_name, re.IGNORECASE)[0]
                )

                house_vote = VoteEvent(
                    chamber="lower",
                    start_date=action_date,
                    motion_text=motion_matches[0].strip(),
                    result="pass" if yeas > nays else "fail",
                    classification="passage",
                    bill=bill,
                )
                house_vote.set_count("yes", yeas)
                house_vote.set_count("no", nays)

                housevote_pdf = (
                    "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                        bill.legislative_session, action_year
                    )
                )
                self.scrape_house_vote(house_vote, housevote_pdf, supplement_no)
                house_vote.add_source(housevote_pdf)
                house_vote.dedupe_key = "{}#{}".format(housevote_pdf, supplement_no)

                # XXX: house votes were disabled on 8/1 to try to get MA
                # importing again; the yield stays commented out until the ID
                # issue is resolved.
                # yield house_vote

            # Senate votes
            if "Roll Call" in action_name:
                # placeholder
                senate_motion = action_name.split(" -")[0]
                # 2019 H86 Breaks our regex,
                # Ordered to a third reading --
                # see Senate   Roll Call #25 and House Roll Call 56
                if "yeas" in action_name and "nays" in action_name:
                    lowered = action_name.lower()
                    tally = re.search(r"(\d+) yeas .*? (\d+) nays", lowered)
                    if tally is not None:
                        yeas, nays = (int(part) for part in tally.groups())
                    else:
                        # Fallback wording: "yeas <n>" ... "nays <n>".
                        yeas = int(re.search(r"yeas\s+(\d+)", lowered).group(1))
                        nays = int(re.search(r"nays\s+(\d+)", lowered).group(1))

                    # TODO: other count isn't included, set later
                    senate_vote = VoteEvent(
                        chamber="upper",
                        start_date=action_date,
                        motion_text=senate_motion,
                        result="pass" if yeas > nays else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    senate_vote.set_count("yes", yeas)
                    senate_vote.set_count("no", nays)

                    rollcall_pdf = "http://malegislature.gov" + row.xpath(
                        "string(td[3]/a/@href)"
                    )
                    self.scrape_senate_vote(senate_vote, rollcall_pdf)
                    senate_vote.add_source(rollcall_pdf)
                    senate_vote.dedupe_key = rollcall_pdf
                    # XXX: also disabled, see the house-vote note above
                    # yield senate_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs["classification"],
            )
            for committee in attrs.get("committees", []):
                action.add_related_entity(committee.strip(), entity_type="organization")
Beispiel #14
0
    def scrape_vote(self, bill, vote_id, session):
        """Fetch one roll call by id from the Delaware JSON endpoint and yield it.

        ``vote_id`` is POSTed to the roll-call endpoint.  When the decoded
        response is falsy nothing is yielded; otherwise a single VoteEvent is
        built from the response's "Model" entry, with the generated roll-call
        PDF URL as both its source and its dedupe key.

        Members are looked up in ``self.legislators_by_short``; when the
        lookup fails the raw short name is recorded instead, with a warning.
        """
        vote_url = (
            "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
        )
        form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}

        self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
        if not page:
            return

        roll = page["Model"]
        vote_chamber = self.chamber_map[roll["ChamberName"]]

        # "7/1/16 01:00 AM"
        taken_at = dt.datetime.strptime(roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p")
        vote_date = taken_at.strftime("%Y-%m-%d")

        # TODO: What does this code mean?
        vote_motion = roll["RollCallVoteType"]

        if roll["RollCallStatus"] == "Passed":
            vote_passed = "pass"
        else:
            vote_passed = "fail"

        # Everything that is neither a yes nor a no is folded into "other".
        other_count = sum(
            int(roll[field])
            for field in (
                "NotVotingCount",
                "VacantVoteCount",
                "AbsentVoteCount",
                "ConflictVoteCount",
            )
        )

        vote = VoteEvent(
            chamber=vote_chamber,
            start_date=vote_date,
            motion_text=vote_motion,
            result=vote_passed,
            bill=bill,
            legislative_session=session,
            classification=[],
        )

        # Vote URL is just a generic search URL with POSTed data,
        # so provide a different link
        vote_pdf_url = (
            "https://legis.delaware.gov"
            "/json/RollCallController/GenerateRollCallPdf"
            "?rollCallId={}&chamberId={}".format(
                vote_id, self.chamber_codes[vote_chamber]
            )
        )
        vote.add_source(vote_pdf_url)
        vote.dedupe_key = vote_pdf_url
        vote.set_count("yes", roll["YesVoteCount"])
        vote.set_count("no", roll["NoVoteCount"])
        vote.set_count("other", other_count)

        for member_row in roll["AssemblyMemberVotes"]:
            # AssemblyMemberId looks like it should work here,
            # but for some sessions it's bugged to only return session
            try:
                legislator = self.legislators_by_short[str(member_row["ShortName"])]
                name = legislator["DisplayName"]
            except KeyError:
                self.warning(
                    "could not find legislator short name %s", member_row["ShortName"]
                )
                name = member_row["ShortName"]

            if member_row["SelectVoteTypeCode"] == "Y":
                vote.yes(name)
            elif member_row["SelectVoteTypeCode"] == "N":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
Beispiel #15
0
    def scrape_chamber(self, chamber, session):
        """Yield roll-call votes parsed from one chamber's journal PDFs.

        The session's journal index page is fetched, every linked PDF is
        converted to text, and the text is walked line by line with a small
        state machine:

        * idle      -- waiting for a line that is exactly "ROLL CALL";
        * in_motion -- accumulating motion text until a line ending in
          "VOTING" or "VOTING.";
        * in_vote   -- collecting "HEADING: name; name; ..." lists until a
          line containing an outcome word closes the vote.

        Lines matching ``chamber_re``, ``date_re`` or ``page_re``, and blank
        lines, are ignored in every state.  One Vote is yielded per bill
        reference found in the text that closes the vote.

        :param chamber: "lower" selects the house journal; any other value
            selects the senate journal.
        :param session: session key; must be a key of ``session_slug`` below
            (KeyError otherwise).
        :raises AssertionError: if the state machine is in both capture states
            at once, or if vote headings leak into the motion text.
        """
        chamber_name = "house" if chamber == "lower" else "senate"
        session_slug = {
            "62": "62-2011",
            "63": "63-2013",
            "64": "64-2015",
            "65": "65-2017",
            "66": "66-2019",
        }[session]

        # Words whose presence marks the line that announces a vote's outcome.
        outcome_words = (
            "passed",
            "adopted",
            "sustained",
            "prevailed",
            "lost",
            "failed",
        )
        # Bill references such as "HB 1234" or "SCR 4001", captured in pieces.
        bill_ref_re = r"(?i)(H|S|J)(C?)(B|R|M) (\d+)"
        # Journal roll-call headings -> vote options.
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other",
        }
        # First letter of a bill id -> chamber the bill belongs to.
        chambers = {"H": "lower", "S": "upper", "J": "legislature"}
        VETO_SUPERMAJORITY = 2 / 3

        # Open the index page of the session's Registers, and open each
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug,
            chamber_name,
        )
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib["href"]

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                continue

            # Convert the PDF to text; always remove the downloaded file,
            # even when the conversion fails.
            try:
                data = convert_pdf(path, type="text").decode("utf-8")
            finally:
                os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            for line in data.splitlines():
                # Ignore lines with no information
                if (
                    re.search(chamber_re, line)
                    or re.search(date_re, line)
                    or re.search(page_re, line)
                    or line.strip() == ""
                ):
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                        "Scraper should not be simultaneously processing "
                        + "motion name and votes, as it is for this motion: "
                        + cur_motion
                    )

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith(("VOTING", "VOTING.")):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    has_outcome = any(word in line.lower() for word in outcome_words)
                    bill_refs = re.findall(bill_ref_re, line)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [x.strip() for x in who.split(";") if x.strip() != ""]
                        results[cur_vote] = who

                        # A line not ending in ";" may have cut a name in two.
                        name_may_be_continued = not line.endswith(";")

                    # Extracts bill numbers in the closing text
                    # used for when the closing text is multiple lines.
                    elif cur_vote is not None and bill_refs and not has_outcome:
                        bills.extend(bill_refs)

                    # Continuation of the current heading's list of names
                    elif cur_vote is not None and not has_outcome:
                        who = [x.strip() for x in line.split(";") if x.strip() != ""]

                        if name_may_be_continued:
                            # Glue the first fragment onto the unfinished name.
                            results[cur_vote][-1] = (
                                results[cur_vote][-1] + " " + who.pop(0)
                            )

                        name_may_be_continued = not line.endswith(";")

                        results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif has_outcome:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(bill_refs)
                        if bills == [] or cur_motion.strip() == "":
                            # Log before wiping the state so the message
                            # actually contains the motion text.
                            self.warning(
                                "No motion or bill name found: "
                                + "motion name: "
                                + cur_motion
                                + "; "
                                + "decision text: "
                                + line.strip()
                            )
                            results = {}
                            cur_motion = ""
                            bills = []
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                "Vote data found in motion name: " + cur_motion
                            )

                        # Use the collected results to determine who voted how
                        res = {
                            option: results.get(heading, [])
                            for heading, option in keys.items()
                        }

                        # Count the number of members voting each way
                        yes, no, other = (
                            len(res["yes"]),
                            len(res["no"]),
                            len(res["other"]),
                        )

                        # Almost all of the time, a vote only applies to one bill and this loop
                        # will only be run once.
                        # Some exceptions exist.

                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            bc = chambers.get(cur_bill_id[0], "other")

                            # Determine whether or not the vote passed
                            if "over the governor's veto" in cur_motion.lower():
                                # With no yes/no votes parsed the ratio is
                                # undefined; treat that as a failed override
                                # instead of dividing by zero.
                                decided = yes + no
                                passed = (
                                    decided > 0
                                    and yes / decided > VETO_SUPERMAJORITY
                                )
                            else:
                                passed = yes > no
                            # Create a Vote object based on the scraped information
                            vote = Vote(
                                chamber=chamber,
                                start_date=cur_date.strftime("%Y-%m-%d"),
                                motion_text=cur_motion,
                                result="pass" if passed else "fail",
                                legislative_session=session,
                                classification="passage",
                                bill=cur_bill_id,
                                bill_chamber=bc,
                            )

                            vote.add_source(pdf_url)
                            vote.add_source(url)
                            vote.set_count("yes", yes)
                            vote.set_count("no", no)
                            vote.set_count("other", other)
                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                for voter in res[key]:
                                    vote.vote(key, voter)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name in keys:
                                # Need to search for the singular, not plural, in the text
                                # so it can find, for example,  " 1 NAY "
                                vote_re = r"(\d+)\s{}".format(category_name[:-1])
                                motion_count = int(re.findall(vote_re, cur_motion)[0])

                                for item in vote.counts:
                                    if item["option"] == keys[category_name]:
                                        vote_count = item["value"]

                                if motion_count != vote_count:
                                    self.warning(
                                        "Motion text vote counts ({}) ".format(
                                            motion_count
                                        )
                                        + "differed from roll call counts ({}) ".format(
                                            vote_count
                                        )
                                        + "for {0} on {1}".format(
                                            category_name, cur_bill_id
                                        )
                                    )

                                    # NOTE(review): this only rebinds the
                                    # local ``vote_count``; ``vote.counts`` is
                                    # not modified.  Confirm whether the
                                    # stored count was meant to be replaced by
                                    # ``motion_count``.
                                    for item in vote.counts:
                                        if item["option"] == keys[category_name]:
                                            vote_count = motion_count

                            yield vote

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
Beispiel #16
0
    def scrape_journal(self, url, chamber, session, date):

        filename, response = self.urlretrieve(url)
        self.logger.info("Saved journal to %r" % filename)
        all_text = convert_pdf(filename, type="text")

        lines = all_text.split(b"\n")
        lines = [line.decode("utf-8") for line in lines]
        lines = [
            line.strip()
            .replace("–", "-")
            .replace("―", '"')
            .replace("‖", '"')
            .replace("“", '"')
            .replace("”", '"')
            for line in lines
        ]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter(
            [
                line
                for line in lines
                if not (
                    line == ""
                    or re.match(header_date_re, line)
                    or re.match(header_journal_re, line)
                )
            ]
        )

        # bill_id -> motion -> count
        motions_per_bill = collections.defaultdict(collections.Counter)

        for line in lines:
            # Go through with vote parse if any of
            # these conditions match.
            if not line.startswith("On the question") or "shall" not in line.lower():
                continue

            # Get the bill_id
            bill_id = None
            bill_re = r"\(\s*([A-Z\.]+\s\d+)\s*\)"

            # The Senate ends its motion text with a vote announcement
            if chamber == "upper":
                end_of_motion_re = r".* the vote was:\s*"
            # The House may or may not end motion text with a bill name
            elif chamber == "lower":
                end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(bill_re)

            while not re.match(end_of_motion_re, line, re.IGNORECASE):
                line += " " + next(lines)

            try:
                bill_id = re.search(bill_re, line).group(1)
            except AttributeError:
                self.warning(
                    "This motion did not pertain to legislation: {}".format(line)
                )
                continue

            # Get the motion text
            motion_re = r"""
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*(?:\?"?|"|’)\s+  # Motion is followed by a question mark and/or a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    """.format(
                # in at least one case [SF 457 from 2020] the bill number is followed by )0
                # seemingly just a typo, this gets around that
                bill_re,
                r",?.*?the\svote\swas:" if chamber == "upper" else r"\d?",
            )
            # print("motion candidate line:", line)
            motion = re.search(motion_re, line, re.VERBOSE | re.IGNORECASE)
            if motion:
                motion = motion.group(1)

            for word, letter in (("Senate", "S"), ("House", "H"), ("File", "F")):

                if bill_id is None:
                    return

                bill_id = bill_id.replace(word, letter)

            bill_id = bill_id.replace(".", "")

            bill_chamber = dict(h="lower", s="upper")[bill_id.lower()[0]]
            votes, passed = self.parse_votes(lines)

            # at the very least, there should be a majority
            # for the bill to have passed, so check that,
            # but if the bill didn't pass, it could still be OK if it got a majority
            # eg constitutional amendments
            if not (
                (passed == (votes["yes_count"] > votes["no_count"])) or (not passed)
            ):
                self.error("The bill passed without a majority?")
                raise ValueError("invalid vote")

            # also throw a warning if the bill failed but got a majority
            # it could be OK, but is probably something we'd want to check
            if not passed and votes["yes_count"] > votes["no_count"]:
                self.logger.warning(
                    "The bill got a majority but did not pass. "
                    "Could be worth confirming."
                )

            result = ""
            if passed:
                result = "pass"
            else:
                result = "fail"

            # check for duplicate motions and number second and up if needed
            motion_text = re.sub("\xad", "-", motion)
            motions_per_bill[bill_id][motion_text] += 1
            new_count = motions_per_bill[bill_id][motion_text]
            if new_count > 1:
                motion_text += f" #{new_count}"

            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion_text,
                result=result,
                classification="passage",
                legislative_session=session,
                bill=bill_id,
                bill_chamber=bill_chamber,
            )

            # add votes and counts
            for vtype in ("yes", "no", "absent", "abstain"):
                vcount = votes["{}_count".format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes["{}_votes".format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
Beispiel #17
0
    def scrape_vote(self, session, bill, vote_url, chamber, date):
        """Read a vote summary page and yield the VoteEvent it describes.

        Nothing is yielded when the page has fewer than three ``<font>`` text
        nodes, when the motion text contains "withdrawn", or when no cells are
        found beside the "Aye" label.

        Args:
            session: Not used by this method.
            bill: Passed through to ``VoteEvent`` as the bill voted on.
            vote_url: Page to load; also the vote's source and dedupe key.
            chamber: Passed through to ``VoteEvent``.
            date: Localized with ``self._tz`` to become the start date.
        """
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        # AM/PM together with a slash is treated as a timestamp, not a motion.
        has_meridiem = "AM" in motion or "PM" in motion
        if has_meridiem and "/" in motion:
            motion = "Motion not given."

        if "withdrawn" in motion:
            return

        def cells_beside(label):
            # Font text of the cells that follow a cell whose font text
            # contains `label`.
            return page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'{}')]]/font/text()".format(label)
            )

        aye_row = cells_beside("Aye")
        absent_row = cells_beside("Absent")
        rule_17c_row = cells_beside("17C")

        if not aye_row:
            self.info("Missing yes no count")
            return

        yes_count = int(aye_row[0])
        no_count = int(aye_row[2])
        exc_count = int(absent_row[2])
        absent_count = int(absent_row[0])
        abstain_count = int(rule_17c_row[0]) if rule_17c_row else 0

        # fix for
        # http://leg.colorado.gov/content/hb19-1029vote65e72e
        if absent_count == -1:
            absent_count = 0

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if yes_count > no_count else "fail",
            bill=bill,
            classification="passage",
        )
        vote.dedupe_key = vote_url
        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("excused", exc_count),
            ("absent", absent_count),
            ("abstain", abstain_count),
        ):
            vote.set_count(option, total)
        vote.add_source(vote_url)

        roll_rows = page.xpath("//tr[preceding-sibling::tr/descendant::"
                               "td/div/b/font[contains(text(),'Vote')]]")

        # Page abbreviation -> vote option recorded on the VoteEvent.
        option_by_code = {
            "Y": "yes",
            "N": "no",
            "E": "excused",
            "A": "absent",
            "-": "absent",
            "17C": "abstain",
        }
        for row in roll_rows:
            marks = row.xpath(".//td/div/font/text()")
            if not marks:
                continue
            code = marks[0].strip()
            member = row.xpath(".//td/font/text()")[0].strip()
            # Rows marked "V" are skipped without recording a vote.
            if code == "V":
                continue
            vote.vote(option_by_code[code], member)

        yield vote
Beispiel #18
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Download a roll-call PDF, parse the voter lists and yield the vote.

        The PDF text is scanned line by line. A section marker such as
        "yeas--" selects the list that the following comma-separated names
        are added to; "None." or a "Total--" entry closes the section.

        Args:
            url: Location of the vote PDF; also recorded as the vote source.
            motion: Key into ``self._vote_mapping``, which supplies the motion
                text and whether the motion passed.
            date: Localized with ``self._tz`` to become the start date.
            chamber: Passed through to ``VoteEvent``.
            bill: Bill voted on; ``bill.identifier`` is part of the dedupe key.

        Yields:
            One ``VoteEvent``, or nothing when the PDF cannot be retrieved.

        Raises:
            KeyError: If ``motion`` is not present in ``self._vote_mapping``.
        """
        try:
            vote_pdf, _ = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("Can't find vote file {}, skipping".format(url))
            return

        # Delete the downloaded file even if the conversion raises.
        try:
            text = convert_pdf(vote_pdf, "text")
        finally:
            os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # Longer markers come before the shorter markers they contain, so that
        # once the longer one is stripped the shorter one no longer matches.
        precursors = (
            ("yeas--", yes_votes),
            ("nays--", no_votes),
            ("absent or those not voting--", absent_votes),
            ("absent and those not voting--", absent_votes),
            ("not voting--", not_voting_votes),
            ("voting present--", other_votes),
            ("present--", other_votes),
            ("disclaimer", None),
        )

        # Fragments that mark a piece of text as something other than a name.
        junk_markers = (
            "on final passage",
            "Necessary",
            "who would have",
            "being a tie",
            "therefore",
            "Vacancies",
            "a pair",
            "Total-",
            "ATTORNEY",
            "SPEAKER",
            "BOARD",
            "TREASURER",
            "GOVERNOR",
            "ARCHIVES",
            "SECRETARY",
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split("\n"))

        for line in lines:

            # check if the line contains a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    # The test above ignores case, so the removal must too;
                    # otherwise a marker such as "Yeas--" stays attached to
                    # the first name and shorter markers match again.
                    line = re.sub(re.escape(pc), "", line, flags=re.IGNORECASE)

            # split names
            for name in line.split(","):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if "None." in name:
                    cur_array = None

                match = re.match(r"(.+?)\. Total--.*", name)
                if match:
                    # A total may appear while no section is active (for
                    # example right after "None."); there is then no list to
                    # extend.
                    if cur_array is not None:
                        cur_array.append(match.group(1))
                    cur_array = None

                # append name if it looks ok
                junk_in_name = any(junk in name for junk in junk_markers)
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name.endswith("."):
                        name = name[:-1]
                    name = self.clean_voter_name(name)
                    cur_array.append(name)

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.dedupe_key = url + "#" + bill.identifier

        tallies = (
            ("yes", yes_votes),
            ("no", no_votes),
            ("absent", absent_votes),
            ("not voting", not_voting_votes),
            ("other", other_votes),
        )
        # Counts are the lengths of the parsed name lists.
        for option, names in tallies:
            vote.set_count(option, len(names))
        vote.add_source(url)
        for option, names in tallies:
            for voter in names:
                vote.vote(option, self.clean_voter_name(voter))
        yield vote
Beispiel #19
0
    def scrape_house_vote(self, url):
        """Fetch one House roll-call document and build its VoteEvent.

        Args:
            url: Address of the roll-call document; also recorded as the
                vote's source.

        Returns:
            A ``VoteEvent`` for chamber "lower" carrying the totals and one
            entry per ``recorded-vote`` element.

        Raises:
            IndexError: If an expected metadata element is missing.
            KeyError: If a recorded choice is not in ``self.vote_codes``.
        """
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        def metadata(path):
            # First node matched below <vote-metadata>.
            return page.xpath("//rollcall-vote/vote-metadata/" + path)[0]

        vote_date = metadata("action-date/text()")
        vote_time = metadata("action-time/@time-etz")

        when = self._TZ.localize(
            datetime.datetime.strptime("{} {}".format(vote_date, vote_time),
                                       "%d-%b-%Y %H:%M"))

        motion = metadata("vote-question/text()")

        # NOTE(review): every result text other than "Passed" is recorded as
        # "fail" -- confirm the feed never reports success in other words.
        result = "pass" if metadata("vote-result/text()") == "Passed" else "fail"

        session = metadata("congress/text()")

        bill_id = metadata("legis-num/text()")

        # for some reason these are "H R 123" which nobody uses, so fix to "HR 123"
        # The lookahead leaves the following letter unconsumed, so every gap
        # between letters is closed ("H J RES 12" -> "HJRES 12"); matching the
        # second letter as a group left "HJ RES 12" because matches cannot
        # overlap.
        bill_id = re.sub(r"([A-Z])\s(?=[A-Z])", r"\1", bill_id)

        roll_call = metadata("rollcall-num/text()")

        vote_id = "us-{}-lower-{}".format(when.year, roll_call)

        vote = VoteEvent(
            start_date=when,
            bill_chamber="lower" if bill_id[0] == "H" else "upper",
            motion_text=motion,
            classification="passage",  # TODO
            result=result,
            legislative_session=session,
            identifier=vote_id,
            bill=bill_id,
            chamber="lower",
        )
        vote.add_source(url)

        vote.extras["house-rollcall-num"] = roll_call

        # Totals element name -> option recorded on the VoteEvent.
        totals = (
            ("yea-total", "yes"),
            ("nay-total", "no"),
            ("not-voting-total", "not voting"),
            ("present-total", "abstain"),
        )
        for element, option in totals:
            count = metadata(
                "vote-totals/totals-by-vote/{}/text()".format(element))
            vote.set_count(option, int(count))

        # vote.yes vote.no vote.vote
        for row in page.xpath("//rollcall-vote/vote-data/recorded-vote"):
            bioguide = row.xpath("legislator/@name-id")[0]
            name = row.xpath("legislator/@sort-field")[0]
            choice = row.xpath("vote/text()")[0]

            vote.vote(self.vote_codes[choice], name, note=bioguide)
        return vote
    def scrape(self, session=None):
        """Yield every bill of a session together with its recorded votes.

        The loaders fill ``self._bills``, ``self._history``, ``self._votes``
        and the other lookup tables read below. Members, sponsors, fiscal
        notes, summaries and amendments are only loaded when the session is
        not classified as "special".

        Args:
            session: Session identifier; when falsy, the identifier of the
                last entry in ``self.jurisdiction.legislative_sessions``.

        Yields:
            For each bill, a ``VoteEvent`` per history row that has a
            reference id, followed by the ``Bill`` itself.
        """
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)

        # First letter of a bill id / history description -> chamber.
        chamber_by_letter = {
            "H": "lower",
            "S": "upper",
            "G": "executive",
            "C": "legislature",
        }
        # Second letter of a bill id -> classification.
        bill_type_by_letter = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution",
        }

        # pull the current session's details to tell if it's a special
        session_details = next(
            candidate
            for candidate in self.jurisdiction.legislative_sessions
            if candidate["identifier"] == session)
        is_special = ("classification" in session_details
                      and session_details["classification"] == "special")

        session_id = SESSION_SITE_IDS[session]
        self.init_sftp(session_id)
        bill_url_base = "https://lis.virginia.gov/cgi-bin/"

        if not is_special:
            self.load_members()
            self.load_sponsors()
            self.load_fiscal_notes()
            self.load_summaries()
        self.load_history()
        self.load_votes()
        self.load_bills()
        if not is_special:
            self.load_amendments()

        for bill_key in self._bills:
            bill = self._bills[bill_key][0]

            bill_id = bill["bill_id"]
            b = Bill(
                bill_id,
                session,
                bill["bill_description"],
                chamber=chamber_by_letter[bill_id[0]],
                classification=bill_type_by_letter[bill_id[1]],
            )
            b.add_source(
                bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}")

            # Long Bill ID needs to have 6 characters to work with vote urls,
            # sponsors, and summaries: left-pad the part after the two-letter
            # prefix with zeros. Other lengths are used as they are.
            long_bill_id = bill_id
            if 3 <= len(bill_id) <= 5:
                long_bill_id = bill_id[:2] + bill_id[2:].rjust(4, "0")

            # Sponsors: use the bill row's patron when no sponsor rows exist
            if (long_bill_id not in self._sponsors
                    and "patron_name" in bill
                    and bill["patron_name"].strip() != ""):
                b.add_sponsorship(
                    bill["patron_name"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
            for spon in self._sponsors[long_bill_id]:
                member = spon["member_name"]
                if member.strip() == "":
                    continue

                is_chief = spon["patron_type"].endswith("Chief Patron")
                b.add_sponsorship(
                    member,
                    classification="primary" if is_chief else "cosponsor",
                    entity_type="person",
                    primary=is_chief,
                )

            # Summary
            for sum_text in self._summaries[long_bill_id]:
                b.add_abstract(sum_text["summary_text"],
                               sum_text["summary_type"])

            # Amendment docs
            for amend in self._amendments[bill_id]:
                amend_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
                b.add_document_link("Amendment: " + amend["txt_docid"],
                                    amend_url,
                                    media_type="text/html")

            # fiscal notes
            for fn in self._fiscal_notes[long_bill_id]:
                note_url = bill_url_base + f"legp604.exe?{session_id}+oth+{fn['refid']}"
                b.add_document_link(
                    "Fiscal Impact Statement: " + fn["refid"],
                    note_url.replace(".PDF", "+PDF"),
                    media_type="application/pdf",
                )

            # actions with 8-digit number followed by D are version titles too
            doc_actions = defaultdict(list)
            # History and then votes
            for hist in self._history[bill_id]:
                action = hist["history_description"]
                action_date = hist["history_date"]
                date = datetime.datetime.strptime(action_date,
                                                  "%m/%d/%y").date()
                action_chamber = chamber_by_letter[action[0]]
                vote_id = hist["history_refid"]
                # Drop the first two characters; the first one was the
                # chamber letter looked up above.
                cleaned_action = action[2:]

                if re.search(r"\d{8}D", cleaned_action):
                    doc_actions[action_date].append(cleaned_action)

                # categorize actions: first matching classifier, else None
                atype = next(
                    (kind for pattern, kind in ACTION_CLASSIFIERS
                     if re.match(pattern, cleaned_action)),
                    None,
                )

                if atype != SKIP:
                    b.add_action(cleaned_action,
                                 date,
                                 chamber=action_chamber,
                                 classification=atype)

                if len(vote_id) == 0:
                    continue

                ballots = self._votes[vote_id]
                # Only these four outcomes are counted; any other value is
                # still recorded per member below.
                tally = {"yes": 0, "no": 0, "not voting": 0, "abstain": 0}
                for ballot in ballots:
                    outcome = ballot["vote_result"]
                    if outcome in tally:
                        tally[outcome] += 1

                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=action_chamber,
                    motion_text=cleaned_action,
                    result="pass" if tally["yes"] > tally["no"] else "fail",
                    classification="passage",
                    bill=b,
                )
                for option in ("yes", "no", "not voting", "abstain"):
                    vote.set_count(option, tally[option])

                vote.add_source(
                    bill_url_base +
                    f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}")
                for ballot in ballots:
                    vote.vote(ballot["vote_result"], ballot["member_id"])
                yield vote

            # Versions
            for version in bill["text_docs"]:
                # Checks if abbr is blank as not every bill has multiple versions
                if not version["doc_abbr"]:
                    continue

                version_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()

                # version text will default to abbreviation provided in CSV
                # but if there is an unambiguous action from that date with
                # a version, we'll use that as the document title
                version_text = version["doc_abbr"]
                same_day_actions = doc_actions[version["doc_date"]]
                if len(same_day_actions) == 1:
                    version_text = same_day_actions[0]

                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )

            yield b
Beispiel #21
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r"<.*?>"

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)["data"] or []

        bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)["data"] or [])

        resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
            year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)["data"] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info["BillNumber"].startswith("J.R.H."):
                bill_type = "joint resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("J.R.S."):
                bill_type = "joint resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.R."):
                bill_type = "resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.R."):
                bill_type = "resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("PR."):
                bill_type = "constitutional amendment"
                if info["Body"] == "H":
                    bill_chamber = "lower"
                elif info["Body"] == "S":
                    bill_chamber = "upper"
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info["BillNumber"].startswith("H."):
                bill_type = "bill"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S."):
                bill_type = "bill"
                bill_chamber = "upper"

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info["BillNumber"]))

            bill_id_original_format = (info["BillNumber"].replace(".",
                                                                  "").replace(
                                                                      " ", ""))

            bill_id = bill_id_original_format

            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info["Title"],
                classification=bill_type,
            )
            if "resolution" in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
                year_slug, info["BillNumber"])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                "following-sibling::dd[1]/ul/li")
            sponsor_type = "primary"
            for sponsor in sponsors:
                if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                    sponsor_type = "cosponsor"
                    continue

                sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                    "Rep.", "").replace("Sen.", "").strip())
                if sponsor_name and not (sponsor_name[:5] == "Less"
                                         and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type="person",
                        primary=(sponsor_type == "primary"),
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                "following-sibling::dd[1]/ul/li/a |"
                '//ul[@class="bill-path"]//a')

            for version in versions:
                if version.xpath("text()"):
                    bill.add_version_link(
                        note=version.xpath("text()")[0],
                        url=version.xpath("@href")[0].replace(" ", "%20"),
                        media_type="application/pdf",
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode("utf-8"),
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(
                    info["BillNumber"]))
                yield bill
                continue

            # Capture actions
            actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
                year_slug, internal_bill_id)
            actions_json = self.get(actions_url)

            # Checks if page actually has json posted
            if "json" in actions_json.headers.get("Content-Type"):
                actions = json.loads(actions_json.text)["data"]
                # Checks to see if any data is actually there
                if actions == "":
                    continue
            else:
                continue
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action["FullStatus"]:
                    actor = "executive"
                elif action["ChamberCode"] == "H":
                    actor = "lower"
                elif action["ChamberCode"] == "S":
                    actor = "upper"
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action["FullStatus"]:
                    # assert chambers_passed == set("HS")
                    action_type = "executive-signature"
                elif "Vetoed by the Governor" in action["FullStatus"]:
                    action_type = "executive-veto"
                elif ("Read first time" in action["FullStatus"]
                      or "Read 1st time" in action["FullStatus"]):
                    action_type = "introduction"
                elif "Reported favorably" in action["FullStatus"]:
                    action_type = "committee-passage-favorable"
                elif actor == "lower" and any(
                        x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("H")
                elif actor == "upper" and any(
                        x.lower().startswith(" aspassed")
                        or x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("S")
                else:
                    action_type = None

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.511
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0209", "/2019")

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.754
                if bill_id == "H 754" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0202", "/2020")

                # https://legislature.vermont.gov/bill/status/2020/H.942
                if bill_id == "H 942" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0200", "/2020")

                action_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(action["StatusDate"],
                                               "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                # strftime doesn't always pad year value (%Y)  (https://bugs.python.org/issue32195)
                # and sometimes this state has typos in year part of the StatusDate value
                # which can cause validation errors, so fix leading zeroes if they are missing
                if action_date.find("-") < 4:
                    action_date = ("0" *
                                   (4 - action_date.find("-"))) + action_date

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                    date=action_date,
                    chamber=actor,
                    classification=action_type,
                )

            # Capture votes
            votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)["data"]
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote["VoteHeaderID"]
                roll_call_url = ("http://legislature.vermont.gov/bill/"
                                 "loadBillRollCallDetails/{0}/{1}".format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)["data"]

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name,
                     _district) = member["MemberName"].split(" of ")
                    member_name = member_name.strip()

                    if member["MemberVote"] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member["MemberVote"] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote["FullStatus"]
                        # seems like we've seen both
                        or "Governor overridden" in vote["FullStatus"] or
                        "Governor overriden" in vote["FullStatus"]):
                    did_pass = True
                elif ("Failed -- " in vote["FullStatus"] or
                      "Veto of the Governor sustained" in vote["FullStatus"]):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear: " +
                                         vote["FullStatus"])

                # Check vote counts
                yea_count = int(
                    re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
                nay_count = int(
                    re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

                vote_start_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                motion_text = re.sub(HTML_TAGS_RE, "",
                                     vote["FullStatus"]).strip()
                vote_identifer = (vote["StatusDate"] + "--" + motion_text +
                                  "--" + roll_call_url)
                vote_to_add = VoteEvent(
                    identifier=vote_identifer,
                    bill=bill,
                    chamber=("lower"
                             if vote["ChamberCode"] == "H" else "upper"),
                    start_date=vote_start_date,
                    motion_text=motion_text,
                    result="pass" if did_pass else "fail",
                    classification="passage",
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count("yes", yea_count)
                vote_to_add.set_count("no", nay_count)
                vote_to_add.set_count("not voting", len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote("not voting", member)

                yield vote_to_add

            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/witnesses".format(
                bill_id_original_format)
            bill.add_document_link(note="Witness List",
                                   url=witnesses_doc_link_url,
                                   media_type="text/html")

            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/conference".format(
                bill_id_original_format)
            page = self.lxmlize(conferees_doc_link_url)
            no_data = page.xpath('//div[@class="no-data"]/text()')
            if not no_data:
                bill.add_document_link(
                    note="Conference Committee Members",
                    url=conferees_doc_link_url,
                    media_type="text/html",
                )

            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
            meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/meetings".format(
                bill_id_original_format)
            bill.add_document_link(
                note="Committee Meetings",
                url=meetings_doc_link_url,
                media_type="text/html",
            )

            yield bill
Beispiel #22
0
    def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                    action_text):
        """Fetch one roll-call results page and yield it as a ``VoteEvent``.

        The page URL is built from ``vote_id``, ``vote_chamber``, ``bill_id``
        and ``self.session_id``.  Member names are grouped by the vote code
        that follows each of them on the page ("Y", "N", "P" or "A"), and the
        per-code name counts are checked against the "Total ..." figures the
        page lists, when those are present.

        Args:
            bill: passed through as the ``bill`` of the resulting vote.
            vote_chamber: used in the URL; its first character is also the
                key looked up in ``self.CHAMBERS``.
            bill_id: used in the URL (``INST`` parameter).
            vote_id: used in the URL (``VOTE`` parameter).
            vote_date: passed through as ``start_date``.
            action_text: passed through as ``motion_text``.

        Yields:
            A single ``VoteEvent``; the result is "pass" when yeas outnumber
            nays and "fail" otherwise.

        Raises:
            AssertionError: a listed total disagrees with the number of
                names collected for that code.
            KeyError: a vote code other than Y/N/P/A follows a member name.
        """
        url = ("http://alisondb.legislature.state.al.us/Alison/"
               "GetRollCallVoteResults.aspx?"
               "VOTE={0}&BODY={1}&INST={2}&SESS={3}".format(
                   vote_id, vote_chamber, bill_id, self.session_id))
        doc = lxml.html.fromstring(self.get(url=url).text)

        voters = {"Y": [], "N": [], "P": [], "A": []}

        # The text cells alternate: name, vote code, name, vote code, ...
        # A trailing unpaired name (odd cell count) is dropped by zip().
        cells = doc.xpath("//table/tr/td/font/text()")
        for name, code in zip(cells[0::2], cells[1::2]):
            is_placeholder = (not name.strip() or name.endswith(", Vacant")
                              or name.startswith("Total "))
            if is_placeholder:
                # Vacant seats, "Total ..." rows and blanks are not members,
                # but the cell after them is still consumed as their "vote".
                continue
            voters[code].append(name)

        # Check name counts against totals listed on the site
        yea_cells = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
        if not yea_cells:
            total_yea = len(voters["Y"])
        else:
            total_yea = int(yea_cells[0].split(":")[-1])
            assert total_yea == len(voters["Y"]), "Yea count incorrect"

        nay_cells = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
        if not nay_cells:
            total_nay = len(voters["N"])
        else:
            total_nay = int(nay_cells[0].split(":")[-1])
            assert total_nay == len(voters["N"]), "Nay count incorrect"

        absent_cells = doc.xpath(
            '//*[starts-with(text(), "Total Absent")]/text()')
        if absent_cells:
            listed_absent = int(absent_cells[0].split(":")[-1])
            assert listed_absent == len(voters["A"]), "Absent count incorrect"
        # "Other" is everything that is neither yea nor nay: P and A codes.
        total_other = len(voters["P"]) + len(voters["A"])

        outcome = "pass" if total_yea > total_nay else "fail"
        vote = VoteEvent(
            chamber=self.CHAMBERS[vote_chamber[0]],
            start_date=vote_date,
            motion_text=action_text,
            result=outcome,
            classification="passage",
            bill=bill,
        )
        for option, tally in (("yes", total_yea), ("no", total_nay),
                              ("other", total_other)):
            vote.set_count(option, tally)
        vote.add_source(url)

        # Cast order matches the page grouping: yeas, nays, then A before P.
        ballots = (
            ("yes", voters["Y"]),
            ("no", voters["N"]),
            ("other", voters["A"] + voters["P"]),
        )
        for option, members in ballots:
            for member in members:
                vote.vote(option, member)

        yield vote
Beispiel #23
0
    def handle_page(self):
        """Parse a floor-vote PDF, given as ``self.lines``, into a ``VoteEvent``.

        The motion text, the totals line and the first roll-call line are
        found at fixed line positions that are shifted when the motion sits
        one line higher than usual or spills over onto extra lines.  Each
        roll-call line is scanned for ``<code> <member>-<district>`` entries.

        Yields:
            One ``VoteEvent`` built from ``self.kwargs`` ("date", "chamber",
            "bill") and the parsed text.  Nothing is yielded (and a warning
            is logged on ``self.scraper``) when fewer than two lines exist.

        Raises:
            AssertionError: no motion text could be found.
            ValueError: the yes/no totals do not match the yes/no votes
                that were parsed.
        """
        if len(self.lines) < 2:
            self.scraper.warning("Bad PDF! " + self.url)
            return

        motion_idx = 4
        # How far the totals line (nominally 6) and the first roll-call line
        # (nominally 9) have moved from their nominal positions.
        shift = 0

        motion = self.lines[motion_idx].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if not motion and not self.lines[motion_idx -
                                         1].startswith("Calendar Page:"):
            motion_idx -= 1
            shift -= 1
            motion = self.lines[motion_idx]
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        # Up to two continuation lines may follow the motion; each one that
        # is present pushes everything below it down by a line.
        for _ in range(2):
            motion_idx += 1
            continuation = self.lines[motion_idx].strip()
            if not continuation:
                break
            motion = "{}, {}".format(motion, continuation)
            shift += 1

        totals_idx = 6 + shift
        roll_start = 9 + shift

        totals = re.search(
            r"^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$",
            self.lines[totals_idx],
        )
        yes_count, no_count, nv_count = map(int, totals.groups())
        if yes_count > no_count:
            result = "pass"
        else:
            result = "fail"

        vote = VoteEvent(
            start_date=self.kwargs["date"],
            chamber=self.kwargs["chamber"],
            bill=self.kwargs["bill"],
            motion_text=motion,
            result=result,
            classification="passage",
        )
        vote.add_source(self.url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)

        option_for_code = {
            "Y": "yes",
            "N": "no",
            "EX": "excused",
            "AV": "abstain",
        }
        for row in self.lines[roll_start:]:
            # The roll call ends at the first blank line.
            if not row.strip():
                break

            # Drop a presiding-officer title so it is not read as part of a
            # name; only the first title that is present is removed.
            for title in (" President ", " Speaker "):
                if title in row:
                    row = row.replace(title, " ")
                    break

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for code, member in re.findall(
                    r"\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*", row):
                vote.vote(option_for_code[code], member.strip())

        # check totals line up: start from the recorded counts and subtract
        # one for every individual vote; everything should land on zero.
        outstanding = {"yes": 0, "no": 0, "other": 0}
        for tally in vote.counts:
            if tally["option"] in ("yes", "no"):
                outstanding[tally["option"]] = tally["value"]
            else:
                outstanding["other"] += tally["value"]

        for cast in vote.votes:
            bucket = cast["option"] if cast["option"] in ("yes",
                                                          "no") else "other"
            outstanding[bucket] -= 1

        if outstanding["yes"] != 0 or outstanding["no"] != 0:
            raise ValueError("vote count incorrect: " + self.url)

        if outstanding["other"] != 0:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.scraper.info(
                "Votes don't add up; looking for additional ones")
            for row in self.lines[roll_start:]:
                if not row.strip():
                    break
                for member in re.findall(r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}",
                                         row):
                    vote.vote("not voting", member.strip())
        yield vote
    def _scrape_upper_chamber(self, session):
        """Scrape roll-call votes out of the Senate journal PDFs for ``session``.

        The journal index URL is derived from ``session``, every link in the
        first table of that page is fetched through ``self._get_pdf``, and
        the decoded text of each PDF is walked line by line with a small
        state machine: a motion line mentioning "vote" opens a vote, header
        lines ("YEAS", "NAYS", "Absent", ...) select the current category,
        the lines in between are split into member names, and the line for
        which ``is_vote_end`` is true closes the vote.

        Yields:
            A ``VoteEvent`` for each closed vote, provided a bill chamber
            (``bc``) has been determined from a motion line in the same PDF.
        """
        # Pick the journal index page: the URL layout changed in 2016 and
        # differs between regular and special sessions.
        if int(session[:4]) >= 2016:
            if len(session) == 4:
                # regular session
                url = "http://www.senate.mo.gov/%sinfo/jrnlist/default.aspx" % (
                    session[-2:], )
            else:
                # special session
                url = "http://www.senate.mo.gov/%sinfo/jrnlist/%sJournals.aspx" % (
                    session[-4:-2],
                    session[-2:],
                )
        else:
            url = "http://www.senate.mo.gov/%sinfo/jrnlist/journals.aspx" % (
                session[-2:])

        # Header text (matched case-insensitively at the start of a line)
        # -> vote option it is recorded under.
        vote_types = {
            "YEAS": "yes",
            "NAYS": "no",
            "Absent with leave": "other",
            "Absent": "other",
            "Vacancies": "other",
        }

        page = self.lxmlize(url)
        # Every link in the first table is treated as a journal PDF.
        journs = page.xpath("//table")[0].xpath(".//a")
        for a in journs:
            pdf_url = a.attrib["href"]
            data = self._get_pdf(pdf_url).decode()
            lines = data.split("\n")

            # Per-journal parser state.
            in_vote = False
            cur_date = None
            # NOTE(review): this initial value is overwritten by the
            # `for vote_type in ...` loop below before it is ever read.
            vote_type = "other"
            cur_bill = ""
            cur_motion = ""
            bc = None
            # vote: option -> list of member names; counts: option -> total
            vote = {}
            counts = collections.defaultdict(int)

            for line in lines:
                line = line.strip()

                # The first date found in the journal is used for every vote
                # in it.  date_re must capture four groups, which are joined
                # into "<weekday>, <month> <day>, <year>" for strptime.
                if cur_date is None:
                    matches = re.findall(date_re, line)
                    if matches != []:
                        date = matches[0]
                        date = "%s, %s %s, %s" % date
                        date = dt.datetime.strptime(date, "%A, %B %d, %Y")
                        cur_date = date

                # A motion match containing "vote" starts a new vote block.
                matches = re.findall(motion_re, line)
                if matches != []:
                    cont = False
                    for x in matches:
                        if "vote" in x.lower():
                            cur_motion = x
                            bill = re.findall(bill_re, x)
                            if bill != []:
                                # First captured group of bill_re selects
                                # the chamber the bill belongs to.
                                bc = {
                                    "H": "lower",
                                    "S": "upper",
                                    "J": "legislature"
                                }[bill[0][0]]

                                # bill_re must capture four groups here.
                                cur_bill = "%s%s%s %s" % bill[0]
                            in_vote = True
                            cont = True
                    if cont:
                        continue
                if in_vote:
                    if is_vote_end(line):
                        in_vote = False
                        yes, no, other = counts["yes"], counts["no"], counts[
                            "other"]
                        # NOTE(review): bc and cur_bill are only reset per
                        # journal, so a vote whose motion names no bill
                        # reuses the values from an earlier motion; when no
                        # bill has been seen yet the vote is dropped here
                        # without resetting `vote`/`counts`.
                        if bc is None:
                            continue

                        # NOTE(review): uses `date` rather than `cur_date`;
                        # both refer to the same value once a date is found.
                        v = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            motion_text=cur_motion,
                            result="pass" if yes > no else "fail",
                            legislative_session=session,
                            classification="passage",
                            bill=cur_bill,
                            bill_chamber=bc,
                        )

                        v.add_source(url)
                        v.add_source(pdf_url)

                        v.set_count("yes", yes)
                        v.set_count("no", no)
                        v.set_count("other", other)

                        for key in vote:
                            for person in vote[key]:
                                v.vote(key, person)

                        yield v
                        # Start the next vote with clean accumulators.
                        vote = {}
                        counts = collections.defaultdict(int)
                        continue
                    # Skip page furniture that can appear inside a vote
                    # block: the running title and dated header lines.
                    if "Journal of the Senate" in line:
                        continue
                    if re.match(
                            r".*(Monday|Tuesday|Wednesday|Thursday|Friday|"
                            r"Saturday|Sunday), .* \d+, \d+.*",
                            line,
                    ):
                        continue

                    # Look for a category header at the start of the line.
                    found = False
                    rl = None
                    for vote_type in list(vote_types):
                        if line.lower().startswith(vote_type.lower()):
                            # e.g. a header followed by "None": nothing to
                            # record for this category (this `continue`
                            # moves on to the next header, not the next line).
                            if "none" in line.lower():
                                continue

                            # Single-senator form: the name is on the header
                            # line itself, so strip the header and the
                            # "-Senator " prefix and keep the remainder.
                            if "Senator" in line and "Senators" not in line:
                                line = self._clean_line(line)
                                line = line[len(vote_type):]
                                line = line.replace("-Senator ", "")
                                rl = line
                            # vote_category stays in effect for the name
                            # lines that follow, until the next header.
                            vote_category = vote_types[vote_type]
                            found = True
                            if vote_category not in vote:
                                vote[vote_category] = []
                    # A pure header line carries no names.
                    if found and rl is None:
                        continue
                    elif rl:
                        line = rl

                    names = [self._clean_line(x) for x in line.strip().split()]
                    if names == []:
                        continue

                    # The last token of a category may look like
                    # "<name>-<count>"; split off the count and add it to
                    # the running total.  A non-numeric suffix means the
                    # whole line is ignored.
                    lname = names[-1]
                    lname = lname.rsplit("-", 1)
                    if len(lname) > 1:
                        person, count = lname
                        if count.isdigit() is False:
                            continue

                        names.pop(-1)
                        names.append(person)
                        counts[vote_category] += int(count)

                    # NOTE(review): vote_category is only assigned once a
                    # header line has matched; a non-empty line reaching
                    # this point before any header (or after `vote` was
                    # reset) would raise here - confirm this cannot occur.
                    for name in names:
                        vote[vote_category].append(name)
Beispiel #25
0
    def handle_page(self):
        """Yield a committee ``VoteEvent`` parsed from the page in ``self.doc``.

        Pages on which no "ctl00_MainContent_lblTotal" span has text are
        skipped.  Otherwise the date, the yea/nay/missed totals, the
        committee and the action are read from their labelled spans and each
        entry of the "vote-list" is recorded under the member's name.

        Yields:
            One ``VoteEvent`` with chamber "lower" and classification
            "committee", for ``self.kwargs["bill"]``.

        Raises:
            ValueError: a member entry carries a vote code that is none of
                "Y", "N", "-" or a parenthesised "(Y)"/"(N)"; also raised by
                the single-element unpackings when a labelled span does not
                yield exactly one text node.
        """
        # Checks to see if any vote totals are provided
        totals_text = self.doc.xpath(
            '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()')
        if len(totals_text) == 0:
            return

        (raw_date, ) = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
        parsed_date = datetime.datetime.strptime(raw_date,
                                                 "%m/%d/%Y %I:%M:%S %p")
        date = format_datetime(parsed_date, "US/Eastern")

        yeas_text = self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')
        yes_count = int(yeas_text[0])
        nays_text = self.doc.xpath('//span[contains(@id, "lblNays")]/text()')
        no_count = int(nays_text[0])
        missed_text = self.doc.xpath(
            '//span[contains(@id, "lblMissed")]/text()')
        other_count = int(missed_text[0])
        if yes_count > no_count:
            result = "pass"
        else:
            result = "fail"

        (committee, ) = self.doc.xpath(
            '//span[contains(@id, "lblCommittee")]/text()')
        (action, ) = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')
        motion = "{} ({})".format(action, committee)

        vote = VoteEvent(
            start_date=date,
            bill=self.kwargs["bill"],
            chamber="lower",
            motion_text=motion,
            result=result,
            classification="committee",
        )
        vote.add_source(self.url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", other_count)

        # Vote code -> method that records a member under that option.
        record_for_code = {"Y": vote.yes, "N": vote.no}

        entries = self.doc.xpath('//ul[contains(@class, "vote-list")]/li')
        for entry in entries:
            # Empty list items are layout filler, not members.
            if not entry.text_content().strip():
                continue

            # Second span holds the name, first span holds the vote code.
            (member, ) = entry.xpath("span[2]//text()")
            (code, ) = entry.xpath("span[1]//text()")
            member = member.strip()

            record = record_for_code.get(code)
            if record is not None:
                record(member)
            elif code == "-":
                vote.vote("not voting", member)
            elif re.search(r"\([YN]\)", code):
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                continue
            else:
                raise ValueError("Unknown vote type found: {}".format(code))

        yield vote
    def scrape_votes(self, session, zip_url):
        """Build roll-call votes from the two pipe-delimited tables in ``self.zf``.

        ``tblrollcallsummary.txt`` contributes one ``Vote`` per row (chamber,
        timestamp, bill, yea/nay totals, motion); rows are kept only when
        their first column equals ``session`` and their bill id is a key of
        ``self.bills_by_id``.  ``tblrollcallhistory.txt`` then contributes
        one row per legislator per roll call, which is attached to the
        matching vote as a yes, no or other vote.

        The "other" count of a vote is the number of its history rows whose
        vote is neither "Yea" nor "Nay".  It is only set on votes that
        received at least one usable history row.

        Args:
            session: value compared with the first column of both tables.
            zip_url: recorded as the source of every vote.

        Yields:
            Every ``Vote`` created from the summary table.
        """
        votes = {}
        last_line = []

        for line in self.zf.open("tblrollcallsummary.txt"):
            if line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                # NOTE(review): this looks like it is meant to stitch a
                # record that was broken across two physical lines back
                # together, but it assigns only the first fragment and, in
                # the else branch, still parses the short line below.  Left
                # unchanged because the intended merge cannot be confirmed
                # from this file alone.
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=time.strftime("%Y-%m-%d"),
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(zip_url)
                votes[body + vote_num] = vote

        # Roll-call key -> number of members who voted neither Yea nor Nay.
        # Accumulated across all history rows of a roll call; resetting it
        # per row (as was done before) left every vote with a count of 0 or 1.
        other_counts = {}

        for line in self.zf.open("tblrollcallhistory.txt"):
            # 2012    | H   | 2    | 330795  | HB309  | Yea |1/4/2012 8:27:03 PM
            session_yr, body, v_num, employee, bill_id, how, _date = line.split(
                "|")

            if not bill_id:
                continue

            if session_yr != session or bill_id.strip() not in self.bills_by_id:
                continue

            try:
                leg = self.legislators[employee]["name"]
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            roll_key = body + v_num
            if roll_key not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % roll_key)
                continue

            roll_call = votes[roll_key]
            other_counts.setdefault(roll_key, 0)
            how = how.strip()
            # code = self.legislators[employee]['seat']
            if how == "Yea":
                roll_call.yes(leg)
            elif how == "Nay":
                roll_call.no(leg)
            else:
                roll_call.other(leg)
                other_counts[roll_key] += 1

        for roll_key, other_count in other_counts.items():
            votes[roll_key].set_count("other", other_count)

        yield from votes.values()
Beispiel #27
0
    def scrape_vote(self, url, session):
        """Parse one roll-call PDF into a ``VoteEvent``.

        The PDF at ``url`` is downloaded, converted to text and read in two
        phases.  In the "preamble" the bill id, the motion and the totals
        line ("N Yeas  N Nays  N Not Voting  N Excused  N Absent") are
        collected; after the totals line, headings such as "Voting Yea"
        select the option under which the following lines of names are
        recorded.

        Args:
            url: location of the PDF; "senate" in the URL selects the upper
                chamber.  Also used as the vote's ``dedupe_key`` and source.
            session: passed through as ``legislative_session``.

        Returns:
            The ``VoteEvent``, or ``None`` when the document is empty,
            mentions a veto in its preamble, or lacks a bill id or a motion.

        Raises:
            IndexError: the text has no "Legislative Date: ..." entry.
        """
        fname, _ = self.urlretrieve(url)
        text = convert_pdf(fname, type="text").decode()
        lines = text.splitlines()

        chamber = "upper" if "senate" in url else "lower"
        if "Maryland" not in text:
            self.warning(f"empty vote from {url}")
            return
        date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

        section = "preamble"
        motion = None
        bill_id = None
        how = None
        voters = defaultdict(list)

        for line in lines:
            if section == "preamble":
                if "vetoed" in line.lower():
                    self.warning(
                        f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                    )
                    return
                possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
                if possible_bill_id:
                    bill_id = possible_bill_id[0]

                # preamble has metadata, then motion, then counts.  our process then is to
                # store the last line as the motion, but if the last line looks like a
                # continuation, append it to the prior line

                line = line.strip()
                counts = re.findall(
                    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                    line,
                )
                if counts:
                    (yes_count, no_count, nv_count, excused_count,
                     absent_count) = (int(count) for count in counts[0])
                    section = "votes"
                elif line and line != "(Const)":
                    # questions seem to be split across two lines; only join
                    # when there is an earlier line to join onto, otherwise
                    # the question itself is the motion
                    if line.endswith("?") and motion:
                        motion = motion + " " + line
                    else:
                        motion = line
            elif section == "votes":
                if line.startswith("Voting Yea"):
                    how = "yes"
                elif line.startswith("Voting Nay"):
                    how = "no"
                elif line.startswith("Not Voting"):
                    how = "not voting"
                elif line.startswith("Excused from Voting"):
                    how = "excused"
                elif line.startswith("Excused (Absent)"):
                    how = "absent"
                elif how:
                    # Names are laid out in columns separated by 2+ spaces.
                    names = re.split(r"\s{2,}", line)
                    voters[how].extend(names)

        # A vote needs both a bill id and a motion; every incomplete
        # combination is skipped (the two partial cases with a warning).
        if not bill_id and not motion:
            return
        if not motion:
            self.warning(
                f"got {bill_id} but no motion, not registering as a vote")
            return
        if not bill_id:
            self.warning(
                f"got {motion} but no bill_id, not registering as a vote")
            return

        # bleh - result not indicated anywhere
        result = "pass" if yes_count > no_count else "fail"
        bill_chamber = "upper" if bill_id.startswith("S") else "lower"
        date = datetime.datetime.strptime(date,
                                          "%b %d, %Y").strftime("%Y-%m-%d")
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            result=result,
            classification="passage",
            motion_text=motion,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )
        # URL includes sequence ID, will be unique
        vote.dedupe_key = url
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)
        for how, names in voters.items():
            for name in names:
                name = name.strip().replace("*", "")
                # Skip blanks and boilerplate that lands among the names.
                if name and "COPY" not in name and "Indicates Vote Change" not in name:
                    vote.vote(how, name)
        check_counts(vote, raise_error=True)
        return vote
    def handle_page(self):
        summary = self.doc.xpath("/".join([
            '//h4[starts-with(text(), "SUMMARY")]',
            "/following-sibling::p",
            "text()",
        ]))
        if summary and summary[0].strip():
            self.obj.add_abstract(abstract=summary[0].strip(), note="summary")

        # versions
        for va in self.doc.xpath(
                '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):

            # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
            date, desc = va.text.split(u" \xa0")
            desc.rsplit(" ", 1)[0]  # chop off last part
            link = va.get("href")
            if "http" not in link:
                link = "{}{}".format(BASE_URL, link)
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            # budget bills in VA are searchable but no full text available
            if "+men+" in link:
                logging.getLogger("va").warning(
                    "not adding budget version, bill text not available")
            else:
                # VA duplicates reprinted bills, lets keep the original name
                self.obj.add_version_link(desc,
                                          link,
                                          date=date,
                                          media_type="text/html",
                                          on_duplicate="ignore")

        # amendments
        for va in self.doc.xpath(
                '//h4[text()="AMENDMENTS"]/following-sibling::ul[1]/li/a[1]'):
            version_name = va.xpath("string(.)")
            if (("adopted" in version_name.lower()
                 or "engrossed" in version_name.lower())
                    and "not adopted" not in version_name.lower()
                    and "not engrossed" not in version_name.lower()):
                version_url = va.xpath("@href")[0]
                self.obj.add_version_link(
                    version_name,
                    version_url,
                    media_type="text/html",
                    on_duplicate="ignore",
                )

        # actions
        seen_next = False
        for ali, next_ali in pairwise(
                self.doc.xpath(
                    '//h4[text()="HISTORY"]/following-sibling::ul[1]/li')):
            # If we've used this action text before, we don't need to parse it again
            if seen_next:
                seen_next = False
                continue
            date, action = ali.text_content().split(u" \xa0")
            try:
                actor, action = action.split(": ", 1)
            except ValueError:
                assert any([
                    action.startswith("{}:".format(x))
                    for x in self.actor_map.keys()
                ]), "Unparseable action text found: '{}'".format(action)
                logging.getLogger("va").warning(
                    "Skipping apparently-null action: '{}'".format(action))
                continue

            # Bill history entries purely in parentheses tend to be
            # notes and not actions, so we'll skip them.
            if action.startswith("(") and action.endswith(")"):
                continue

            actor = self.actor_map[actor]
            date = datetime.datetime.strptime(date.strip(), "%m/%d/%y").date()

            # if action ends in (##-Y ##-N) remove that part
            vrematch = self.vote_strip_re.match(action)
            # The following conditional logic is messy to handle
            # Virginia's crazy and inconsistently formatted bill
            # histories. Someone less harried and tired than me
            # could probably make this much cleaner. - alo
            if vrematch:
                vote_action, y, n, o = vrematch.groups()
                y = int(y)
                n = int(n)
                # Set default count for "other" votes to 0. We have to
                # do this explicitly as it's excluded from the action
                # text when there were no abstentions (the only type of
                # "other" vote encountered thus far).
                o = int(o) if o else 0

                vote_url = ali.xpath("a/@href")

                # Finds relevant information from the current action if
                # vote count encountered, then searches for the presence
                # of identical counts in the next entry (we assume that
                # it's probably there). If matching votes are found, it
                # merges data in both to create a unified vote record.
                #
                # This is because Virginia usually publishes two lines
                # of history data for a single vote, without guaranteed
                # order, so we unsafely attempt to match on identical
                # vote counts in the next line.
                vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    motion_text=vote_action.strip(),
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=self.obj,
                )
                vote.set_count("yes", y)
                vote.set_count("no", n)
                vote.set_count("other", o)

                try:
                    next_action = (
                        next_ali.text_content().split(" \xa0")[1].split(
                            ": ", 1)[1])
                except (AttributeError, ValueError):
                    next_action = ""

                vrematch_next = self.vote_strip_re.match(next_action)
                if vrematch_next:
                    vote_action_next, y_next, n_next, o_next = vrematch_next.groups(
                    )
                    y_next = int(y_next)
                    n_next = int(n_next)
                    o_next = int(o_next) if o_next else 0
                    vote_url_next = next_ali.xpath("a/@href")
                    # Check that the vote counts match and that only one action
                    # has a URL (otherwise, they're probably different votes).
                    if [y_next, n_next, o_next
                        ] == [y, n, o] and len(vote_url) != len(vote_url_next):
                        seen_next = True
                        if not vote_url:
                            vote_url = vote_url_next
                        else:
                            vote.motion_text = vote_action_next.strip()
                            action = next_action

                if vote_url:
                    list(
                        self.scrape_page_items(VotePage,
                                               url=vote_url[0],
                                               obj=vote))
                    vote.add_source(vote_url[0])
                else:
                    vote.add_source(self.url)

                yield from add_pupa_id(vote)

            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, action):
                    break
            else:
                atype = None

            # if matched a 'None' atype, don't add the action
            if atype != SKIP:
                self.obj.add_action(action,
                                    date,
                                    chamber=actor,
                                    classification=atype)
Beispiel #29
0
    def parse_vote_pdf(self, vote_url, bill):
        """Build a VoteEvent from the roll-call PDF found at ``vote_url``.

        The PDF is fetched with ``self.urlretrieve``, converted to text, and
        then mined for the vote date, the tallies, the motion text and the
        names listed under each vote type.

        :param vote_url: address of the PDF; the chamber is "upper" when the
            URL contains "Senate" and "lower" otherwise.
        :param bill: bill the vote is attached to; its ``identifier`` is
            folded into the vote's ``pupa_id``.
        :returns: the populated VoteEvent.
        :raises ValueError: when no tally line is located, or when the number
            of names collected differs from the printed tallies.
        """
        pdf_path, _response = self.urlretrieve(vote_url)
        lines = convert_pdf(pdf_path, type="text").decode().splitlines()

        chamber = "upper" if "Senate" in vote_url else "lower"

        raw_date = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(raw_date, "%b %d, %Y %I:%M (%p)")

        # The tally line is the first one mentioning both "Yeas" and "Nays".
        page_index = next(
            (
                position
                for position, content in enumerate(lines)
                if "Yeas" in content and "Nays" in content
            ),
            None,
        )

        vote_types = ["yes", "no", "not voting", "excused", "absent"]
        vote_counts = [0] * len(vote_types)

        # A falsy index (None, or a match on the very first line) is
        # treated as "tally line not found".
        if not page_index:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        tally_cells = re.split(r"\s{2,}", lines[page_index].strip())
        for slot, cell in enumerate(tally_cells):
            # Each cell is "<number> <label>"; only the number is kept.
            tally, _label = cell.split(" ", 1)
            vote_counts[slot] = int(tally)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            "Consent Calendar" in content for content in lines[:page_index]
        )
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = (
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        )

        def mentions_motion(candidate):
            # True when the candidate text contains any motion keyword.
            lowered = candidate.lower()
            return any(keyword in lowered for keyword in motion_keywords)

        # Offsets (in lines above the tally line) probed for the motion text,
        # in order of preference.
        for offset in (3, 2, 4, 5):
            if mentions_motion(motion):
                break
            motion = re.split(r"\s{2,}", lines[page_index - offset].strip())[0]
        else:
            if not mentions_motion(motion):
                # This condition covers for the bad formatting in SB 1260
                motion = lines[page_index - 3]
            if not mentions_motion(motion):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for vote_type, tally in zip(vote_types, vote_counts):
            vote.set_count(vote_type, tally)

        # Keywords marking the start of the next group of names in the PDF
        show_stoppers = (
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        )

        # Names collected per vote type, compared against the printed
        # tallies (extracted independently) once the whole PDF is read.
        vote_name_counts = [0] * len(vote_types)
        section = 0

        # Name listings start two lines below the tally line.
        for raw_line in lines[page_index + 2:]:
            entry = raw_line.strip()

            if not entry or "Voting Yea" in entry:
                continue

            if any(marker in entry for marker in show_stoppers):
                # Each marker line moves on to the next vote type.
                section += 1
                continue

            names = re.split(r"\s{2,}", entry)
            vote_name_counts[section] += len(names)
            for name in names:
                vote.vote(vote_types[section], name)

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Beispiel #30
0
    def scrape_votes(self, bill, doc):
        """Yield one VoteEvent per roll-call row found on a bill page.

        For every row the summary counts are read from the page itself, then
        the linked roll-call page is fetched with ``self.get`` to record how
        each named member voted.

        :param bill: bill the votes are attached to.
        :param doc: parsed bill page; only its ``xpath`` method is used here.
        :returns: generator of VoteEvent objects.
        :raises ValueError: when the roll-call label contains neither "H" nor
            "S" (so no chamber can be chosen), or when the vote timestamp
            matches neither accepted format.
        """
        vote_tr_path = ('//h6[@id="vote-header"]'
                        '/ancestor::div[contains(@class, "gray-card")]'
                        '//div[contains(@class, "card-body")]'
                        '//div[@class="row"]')

        # Checked in order; the first label present in a roll-call row decides
        # the option its names are recorded under.
        # NOTE(review): names are recorded under "abstain"/"absent" while the
        # counts below use "not voting"/"absent"/"excused" -- confirm this
        # mismatch is intended.
        row_labels = (
            ("Ayes (", "yes"),
            ("Noes (", "no"),
            ("Excused Absence (", "absent"),
            ("Not Voting (", "abstain"),
        )

        for vote_row in doc.xpath(vote_tr_path):
            entries = [
                each.text_content() for each in vote_row.xpath("div")[1:-1:2]
            ]
            # Exactly nine cells are expected; the trailing total is unused.
            (date, subject, rcs, aye, no, nv, absent_count, exc,
             _total) = entries
            result = vote_row.xpath("div/a")[0]
            result_text = result.text
            result_link = result.get("href")

            if "H" in rcs:
                chamber = "lower"
            elif "S" in rcs:
                chamber = "upper"
            else:
                # Previously this left ``chamber`` unbound on the first row, or
                # silently reused the chamber of the preceding row.
                raise ValueError(
                    "Cannot determine chamber from roll call '{}'".format(rcs))

            # ``%p`` only affects the parsed hour when the hour is read with
            # ``%I``; with ``%H`` every afternoon vote was stored 12 hours
            # early. Fall back to ``%H`` for hours that are not valid on a
            # 12-hour clock so inputs the old format accepted still parse.
            stamp = date.replace(".", "")
            try:
                moment = dt.datetime.strptime(stamp, "%m/%d/%Y %I:%M %p")
            except ValueError:
                moment = dt.datetime.strptime(stamp, "%m/%d/%Y %H:%M %p")
            # presumably ``eastern`` is a module-level pytz timezone -- verify
            start_date = eastern.localize(moment).isoformat()

            ve = VoteEvent(
                chamber=chamber,
                start_date=start_date,
                motion_text=subject,
                result="pass" if "PASS" in result_text else "fail",
                bill=bill,
                classification="passage",  # TODO: classify votes
            )
            ve.set_count("yes", int(aye))
            ve.set_count("no", int(no))
            ve.set_count("not voting", int(nv))
            ve.set_count("absent", int(absent_count))
            ve.set_count("excused", int(exc))
            ve.add_source(result_link)

            data = self.get(result_link).text
            vdoc = lxml.html.fromstring(data)

            # only one table that looks like this
            vote_table = vdoc.xpath("//div[@class='row ncga-row-no-gutters']")

            # Grabs names for how people voted
            for row in vote_table:
                row_text = row.text_content()
                # Rows containing "None" carry no names to record.
                if "None" in row_text:
                    continue
                vote_type = next(
                    (option for label, option in row_labels
                     if label in row_text),
                    None,
                )
                if vote_type is None:
                    # Not a roll-call listing row.
                    continue

                # One entry per line; the first two entries and the last one
                # are dropped, and spaces are stripped out of every entry.
                cells = row_text.replace("\n", ";").replace(" ", "")
                votes_names = cells.strip().split(";")[2:-1]

                for name in votes_names:
                    name = name.replace("\r", "")
                    # Resolves names that have '(Chair)' in them
                    if "(" in name:
                        name = name[:name.find("(")]
                    if not name:
                        # Blank entries used to raise IndexError below.
                        continue
                    # Adds a space to names inbetween initial and last name
                    # eg: L.Johnson -> L. Johnson
                    if len(name) > 2 and name[1] == "." and name[2] != " ":
                        name = name[:2] + " " + name[2:]
                    ve.vote(vote_type, name)

            yield ve