Ejemplo n.º 1
0
    def scrape_committee_vote(self, bill, actor, date, motion, page, url,
                              uniqid):
        """Build a committee Vote from the first table of a parsed vote page.

        The vote row's cells are read positionally: index 0 holds the "yes"
        names, index 2 the "no" names and index 4 the "other" names (cells at
        index 1 and 3 are ignored).  Within a cell, every non-blank text node
        is taken as one voter name.

        Args:
            bill: bill the vote is attached to.
            actor: passed through as the Vote's ``chamber``.
            date: passed through as the Vote's ``start_date``.
            motion: motion text of the vote.
            page: parsed document exposing ``.xpath()``.
            url: source URL recorded on the Vote.
            uniqid: stored (stringified) as the Vote identifier and, as-is,
                in ``extras["_vote_id"]``.

        Yields:
            One populated Vote.

        Raises:
            IndexError: if the page has no table, no row, or the row has no
                cells.
        """
        table = page.xpath("//table")[0]
        row = table.xpath(".//tr")[0]
        if row[0].text_content() == "Votes:":
            # New website: the first row is a "Votes:" header, the names are
            # in the third row.
            row = table.xpath(".//tr")[2]

        cells = row.xpath(".//td")
        yes = cells[0]
        # A missing "no"/"other" cell means nobody voted that way.  Indexing
        # by position (instead of unpacking exactly five cells) also copes
        # with rows that carry only three or four cells.
        no = cells[2] if len(cells) > 2 else None
        other = cells[4] if len(cells) > 4 else None

        def proc_block(obj, typ):
            """Return the type, count and voter names held by one cell."""
            if obj is None:
                return {"type": None, "count": None, "votes": []}
            names = [
                text.strip() for text in obj.xpath("./text()") if text.strip()
            ]
            return {"type": typ, "count": len(names), "votes": names}

        vote_dict = {
            "yes": proc_block(yes, "yes"),
            "no": proc_block(no, "no"),
            "other": proc_block(other, "other"),
        }

        yes_count = vote_dict["yes"]["count"]
        # Absent cells report a count of None; treat that as zero.
        no_count = vote_dict["no"]["count"] or 0
        other_count = vote_dict["other"]["count"] or 0
        vote = Vote(
            chamber=actor,
            start_date=date,
            motion_text=motion,
            identifier=str(uniqid),
            result="pass" if (yes_count > no_count) else "fail",
            classification="passage",
            bill=bill,
        )
        vote.extras = {"_vote_id": uniqid}
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        for option, block in vote_dict.items():
            for voter in block["votes"]:
                vote.vote(option, voter)

        yield vote
Ejemplo n.º 2
0
    def parse_vote(self, bill, actor, date, motion, url, uniqid):
        """Fetch a plain-text roll call from *url* and yield it as a Vote.

        The page text is matched against a YEAS / NAYS / ABSENT pattern; each
        section contributes a count followed by voter names separated by runs
        of two or more whitespace characters.  The vote passes when there are
        more yeas than nays.  *url* is recorded as a source on both the bill
        and the vote.  ``actor`` is used as the vote's chamber only when it is
        "upper" or "lower"; otherwise the chamber is the empty string.
        """
        text = self.get(url).text
        bill.add_source(url)

        tally_re = re.compile(
            r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
            r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
            r"(\d+)(.*)",
            re.MULTILINE | re.DOTALL,
        )
        found = tally_re.search(text)
        # Groups 1, 3 and 6 are the yes / no / other counts.
        yes_count, no_count, other_count = (
            int(found.group(index)) for index in (1, 3, 6)
        )

        vote = Vote(
            chamber=actor if actor in ("upper", "lower") else "",
            start_date=date,
            motion_text=motion,
            result="pass" if yes_count > no_count else "fail",
            identifier=str(uniqid),
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        for option, total in (
            ("yes", yes_count),
            ("no", no_count),
            ("other", other_count),
        ):
            vote.set_count(option, total)

        # Groups 2, 4 and 7 are the name lists following each count.
        name_lists = [
            re.split(r"\s{2,}", found.group(index).strip())
            for index in (2, 4, 7)
        ]
        recorders = (
            vote.yes,
            vote.no,
            lambda voter: vote.vote("other", voter),
        )
        for record, names in zip(recorders, name_lists):
            for voter in names:
                if voter:
                    record(voter)

        yield vote
Ejemplo n.º 3
0
    def scrape_votes(self, bill):
        """Yield one Vote per roll call the GetRollCalls service returns.

        The bill number is the second whitespace-separated token of
        ``bill.identifier``; it is sent together with ``self.biennium``.
        Absent and excused counts are summed into a single "other" count, and
        a roll call passes only when yeas outnumber nays plus others.
        """
        number = bill.identifier.split()[1]
        url = (
            "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
            "GetRollCalls?billNumber=%s&biennium=%s" % (number, self.biennium)
        )
        response = self.get(url)
        doc = lxml.etree.fromstring(response.content)

        for roll_call in xpath(doc, "//wa:RollCall"):
            motion = xpath(roll_call, "string(wa:Motion)")
            seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

            # VoteDate is a timestamp; only the part before "T" is kept.
            day_text = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
            voted_on = datetime.datetime.strptime(day_text, "%Y-%m-%d").date()

            yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
            no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
            absent = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
            excused = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
            other_count = absent + excused

            agency = xpath(roll_call, "string(wa:Agency)")
            # Any agency other than House/Senate raises KeyError.
            chamber = {"House": "lower", "Senate": "upper"}[agency]

            outcome = "fail"
            if yes_count > (no_count + other_count):
                outcome = "pass"

            vote = Vote(
                chamber=chamber,
                start_date=voted_on,
                motion_text="{} (#{})".format(motion, seq_no),
                result=outcome,
                bill=bill,
                classification=[],
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("other", other_count)
            vote.add_source(url)

            for member in xpath(roll_call, "wa:Votes/wa:Vote"):
                member_name = xpath(member, "string(wa:Name)")
                cast = xpath(member, "string(wa:VOte)")

                if cast == "Yea":
                    vote.yes(member_name)
                    continue
                if cast == "Nay":
                    vote.no(member_name)
                    continue
                vote.vote("other", member_name)

            yield vote
Ejemplo n.º 4
0
    def parse_vote(self, chamber, bill, row, action_text, action_date, url):
        """Yield a Vote built from the tallies shown in one action row.

        The four tallies ("A Favor", "En Contra", "Abstenido", "Ausente") are
        read from the row's labelled ``smalltxt`` spans.  The vote is
        classified as "passage" when the action text mentions
        "Votación Final", otherwise as "other", and it passes when the yes
        tally exceeds the no tally.  The row is also handed to
        ``self.parse_version`` with ``is_document=True``.
        """
        tally_paths = (
            './/div[label[contains(text(), "A Favor")]]/span[contains(@class,"smalltxt")]/text()',
            './/div[label[contains(text(), "En Contra")]]/span[contains(@class,"smalltxt")]/text()',
            './/div[label[contains(text(), "Abstenido")]]/span[contains(@class,"smalltxt")]/text()',
            './/div[label[contains(text(), "Ausente")]]/span[contains(@class,"smalltxt")]/text()',
        )
        # Evaluated lazily and in order: yes, no, abstain, absent.
        yes, no, abstain, absent = (
            int(row.xpath(tally_path)[0]) for tally_path in tally_paths
        )

        vote_chamber = self.parse_vote_chamber(chamber, action_text)

        if u"Votación Final" in action_text:
            classification = "passage"
        else:
            classification = "other"

        outcome = "pass" if (yes > no) else "fail"
        vote = Vote(
            chamber=vote_chamber,
            start_date=action_date,
            motion_text=action_text,
            result=outcome,
            bill=bill,
            classification=classification,
        )
        vote.add_source(url)
        for option, total in (
            ("yes", yes),
            ("no", no),
            ("absent", absent),
            ("abstain", abstain),
        ):
            vote.set_count(option, total)

        # The PDF attached to a vote row is registered as a document rather
        # than as a bill version.
        # TODO: maybe this should be set as the source?
        self.parse_version(bill, row, is_document=True)

        yield vote
Ejemplo n.º 5
0
    def scrape_votes(self, bill, url):
        """Yield a Vote for each distinct roll call found on the page at *url*.

        The page is scanned for ``<p>`` elements whose text names the
        "OKLAHOMA HOUSE" or "OKLAHOMA STATE SENATE"; each such header starts
        one roll call.  For every header the motion, RCS number, date,
        per-category counts and voter names are read from the ``<p>``
        siblings that follow it.

        Args:
            bill: bill object the votes are attached to.
            url: address of the roll-call page; also recorded as the source.

        Yields:
            One Vote per RCS number (repeated RCS numbers are skipped), except
            for roll calls flagged as badly formatted.

        Raises:
            Exception: if a name collected under "no" contains a colon.
        """
        # Non-breaking spaces are replaced with plain spaces before parsing.
        page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

        # RCS numbers already processed on this page.
        seen_rcs = set()

        # EXSLT regular-expression namespace, needed for re:test() in XPath.
        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={"re": re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if "HOUSE" in header.xpath("string()"):
                chamber = "lower"
                motion_index = 8
            else:
                chamber = "upper"
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            # Collapse any run of whitespace into a single space.
            motion = re.sub(r"\s+", " ", motion)
            if not motion.strip():
                self.warning("Motion text not found")
                # NOTE: this ends the whole generator, so any later roll
                # calls on the page are not scraped either.
                return
            # A trailing PASSED/FAILED is split off the motion text and used
            # as the result; otherwise the result is derived from the counts.
            match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == "PASSED"
            else:
                passed = None

            # First following paragraph that mentions "RCS#".
            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)

            # Emit each RCS number only once per page.
            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            # The date is read from the element right after the RCS# paragraph.
            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r"\d+/\d+/\d+", date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # vtype is the category ("yes"/"no"/"other") that subsequent name
            # lines belong to; it stays None until a YEAS line is seen.
            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            seen_yes = False

            # Walk the paragraphs from the 14th following sibling onwards,
            # stopping at the first line containing "*****".
            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace("\r\n", " ").strip()
                if "*****" in line:
                    break
                regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                         r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
                match = re.match(regex, line)
                if match:
                    # A "<CATEGORY>: <count>" line switches the category.
                    if match.group(1) == "YEAS" and "RCS#" not in line:
                        vtype = "yes"
                        seen_yes = True
                    elif match.group(1) == "NAYS" and seen_yes:
                        vtype = "no"
                    elif match.group(1) == "VACANT":
                        continue  # skip these
                    elif seen_yes:
                        vtype = "other"
                    # Anything after the count on a category line marks the
                    # roll call as malformed; it is dropped further below.
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    # Counts matched before the first YEAS line are added
                    # under the key None, which is never read afterwards.
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    # Name line: names are separated by three spaces.
                    for name in line.split("   "):
                        if not name:
                            continue
                        # Skip fragments that look like a chamber heading.
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            # No explicit PASSED/FAILED in the motion: yeas must outnumber
            # nays plus others.
            if passed is None:
                passed = counts["yes"] > (counts["no"] + counts["other"])

            vote = Vote(
                chamber=chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])
            # Page URL plus the RCS number.
            vote.dedupe_key = url + "#" + rcs

            vote.add_source(url)

            for name in votes["yes"]:
                vote.yes(name)
            for name in votes["no"]:
                # A colon suggests a category line was mistaken for a name;
                # fail loudly rather than record it.
                if ":" in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes["other"]:
                vote.vote("other", name)

            yield vote
    def scrape_votes(self, session, zip_url):
        """Yield Votes assembled from the two roll-call tables in ``self.zf``.

        ``tblrollcallsummary.txt`` supplies one pipe-delimited record per roll
        call (session year, body, vote number, timestamp, bill id, yeas, nays,
        ... motion in field 11).  ``tblrollcallhistory.txt`` supplies one
        record per member vote and is used to attach names and to tally the
        "other" count.

        Only records whose session year equals *session* and whose bill id is
        a key of ``self.bills_by_id`` are used.

        Args:
            session: session year string compared against each record.
            zip_url: URL recorded as the source of every Vote.

        Yields:
            Every Vote created from the summary table, after the history table
            has been applied.
        """
        # NOTE(review): self.zf is presumably a zipfile.ZipFile; if so, on
        # Python 3 the lines below are bytes and the str comparisons/splits
        # would need a decode first - confirm against the caller.
        votes = {}
        # Running number of non-yea/non-nay votes per roll call.  Kept outside
        # the loop so the tally accumulates instead of restarting at zero on
        # every history row.
        other_counts = {}
        last_line = []

        for line in self.zf.open("tblrollcallsummary.txt"):
            if line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                # A record may be broken across two physical lines.  Only
                # fields 0-11 are read below, so the first fragment is parsed
                # as-is (and re-parsed when the second fragment arrives).
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=time.strftime("%Y-%m-%d"),
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(zip_url)
                votes[body + vote_num] = vote

        for line in self.zf.open("tblrollcallhistory.txt"):
            # 2012    | H   | 2    | 330795  | HB309  | Yea |1/4/2012 8:27:03 PM
            session_yr, body, v_num, employee, bill_id, vote, date = line.split("|")

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = self.legislators[employee]["name"]
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue
                roll_call = votes[vote_key]
                # code = self.legislators[employee]['seat']
                if vote == "Yea":
                    roll_call.yes(leg)
                elif vote == "Nay":
                    roll_call.no(leg)
                else:
                    # Recorded through the generic vote() call, as the other
                    # scrapers do for non-yes/no options.
                    roll_call.vote("other", leg)
                    other_counts[vote_key] = other_counts.get(vote_key, 0) + 1
                roll_call.set_count("other", other_counts.get(vote_key, 0))
        for vote in votes.values():
            yield vote
Ejemplo n.º 7
0
    def scrape_votes(self, session):
        """Yield Votes built from the roll-call summary and history files.

        ``RollCallSummary.txt`` supplies one pipe-delimited record per roll
        call (session year, body, vote number, timestamp, bill id, yeas, nays,
        ... motion in field 11).  ``RollCallHistory.txt`` supplies one record
        per member vote and is used to attach names and to keep the "other"
        count in step with the names recorded.

        Only records whose session year equals *session* and whose bill id is
        a key of ``self.bills_by_id`` are used.

        Args:
            session: session year string compared against each record.

        Yields:
            Every Vote created from the summary file, after the history file
            has been applied.
        """
        votes = {}
        other_counts = defaultdict(int)
        last_line = []
        vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
        lines = self.get(vote_url).content.decode("utf-8").splitlines()

        for line in lines:

            if len(line) < 2:
                continue

            if line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                # A record may be broken across two physical lines.  Only
                # fields 0-11 are read below, so the first fragment is parsed
                # as-is (and re-parsed when the second fragment arrives).
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
            # Drop a leading byte-order mark.  Once the payload has been
            # decoded as UTF-8 the BOM is the single character U+FEFF; the
            # three-character form is kept for text decoded byte-for-byte.
            session_yr = line[0].replace("\xef\xbb\xbf", "").replace(
                "\ufeff", "")
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                # Timestamps are interpreted as America/New_York local time.
                time = pytz.timezone("America/New_York").localize(
                    time).isoformat()
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=time,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(vote_url)
                vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
                votes[body + vote_num] = vote

        for line in (self.get(
                "http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt"
        ).content.decode("utf-8").splitlines()):
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, vote, date = line.split(
                "|")
            # Same BOM clean-up as for the summary file, so the first record
            # of the history file is not silently dropped.
            session_yr = session_yr.replace("\ufeff", "")

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    # Collapse internal runs of whitespace in the name.
                    leg = " ".join(self.legislators[employee]["name"].split())
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                vote_key = body + v_num
                if vote_key not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % vote_key)
                    continue
                # code = self.legislators[employee]['seat']

                if vote == "Yea":
                    votes[vote_key].yes(leg)
                elif vote == "Nay":
                    votes[vote_key].no(leg)
                else:
                    votes[vote_key].vote("other", leg)
                    # hack-ish, but will keep the vote count sync'd
                    other_counts[vote_key] += 1
                    votes[vote_key].set_count("other", other_counts[vote_key])
        for vote in votes.values():
            yield vote
Ejemplo n.º 8
0
    def scrape_chamber(self, chamber, session):
        """Yield Votes parsed from every journal PDF of one chamber's session.

        The journal index page for the session is fetched, each linked PDF is
        converted to text, and the text is walked line by line with a small
        state machine:

        * idle -> ``in_motion`` when a "ROLL CALL" line is seen;
        * ``in_motion`` accumulates the motion text until a line ending in
          "VOTING" / "VOTING." is seen, then switches to ``in_vote``;
        * ``in_vote`` collects "CATEGORY: name; name; ..." lines (with
          continuation lines), bill identifiers, and finally the outcome line
          (one containing passed/adopted/sustained/prevailed/lost/failed),
          at which point one Vote per referenced bill is yielded.

        Args:
            chamber: "lower" selects the house journal, anything else the
                senate journal; also used as each Vote's chamber.
            session: session key; must be one of the keys of the slug table
                below.

        Yields:
            One Vote per bill referenced by each completed roll call.

        Raises:
            KeyError: if *session* has no slug.
            AssertionError: if the parser is in both states at once, or if
                vote data ends up inside the motion text.
        """
        chamber_name = "house" if chamber == "lower" else "senate"
        session_slug = {
            "62": "62-2011",
            "63": "63-2013",
            "64": "64-2015",
            "65": "65-2017",
            "66": "66-2019",
        }[session]

        # Words that identify the line announcing a roll call's outcome.
        outcome_words = (
            "passed",
            "adopted",
            "sustained",
            "prevailed",
            "lost",
            "failed",
        )
        # Bill identifiers such as "HB 1001" (matched case-insensitively).
        bill_id_re = r"(?i)(H|S|J)(C?)(B|R|M) (\d+)"
        # Journal category heading -> vote option.
        keys = {
            "YEAS": "yes",
            "NAYS": "no",
            "ABSENT AND NOT VOTING": "other",
        }
        # First letter of a bill identifier -> originating chamber.
        chambers = {
            "H": "lower",
            "S": "upper",
            "J": "legislature"
        }

        # Open the index page of the session's journals, then each PDF on it
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug,
            chamber_name,
        )
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib["href"]

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                continue

            # Convert the PDF to text, then remove the downloaded file
            data = convert_pdf(path, type="text").decode("utf-8")
            os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            for line in data.splitlines():
                stripped = line.strip()

                # Ignore lines with no information
                if (re.search(chamber_re, line) or re.search(date_re, line)
                        or re.search(page_re, line) or stripped == ""):
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                        "Scraper should not be simultaneously processing " +
                        "motion name and votes, as it is for this motion: " +
                        cur_motion)

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if stripped == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = stripped
                    else:
                        cur_motion = cur_motion + " " + stripped

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if stripped.endswith("VOTING") or stripped.endswith(
                            "VOTING."):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    has_outcome = any(
                        word in line.lower() for word in outcome_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        results[cur_vote] = [
                            x.strip() for x in who.split(";")
                            if x.strip() != ""
                        ]
                        # A line not ending in ";" may have been cut in the
                        # middle of a name.
                        name_may_be_continued = not line.endswith(";")

                    elif cur_vote is not None and not has_outcome:
                        found_bills = re.findall(bill_id_re, line)
                        if found_bills:
                            # Bill numbers in the closing text, for when the
                            # closing text spans multiple lines.
                            bills.extend(found_bills)
                        else:
                            # Continuation of the current category's names.
                            who = [
                                x.strip() for x in line.split(";")
                                if x.strip() != ""
                            ]

                            if name_may_be_continued:
                                # Glue the first fragment onto the last name
                                # of the previous line.
                                results[cur_vote][-1] = (
                                    results[cur_vote][-1] + " " + who.pop(0))

                            name_may_be_continued = not line.endswith(";")

                            results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif has_outcome:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(re.findall(bill_id_re, line))
                        if bills == [] or cur_motion.strip() == "":
                            # Log before clearing, so the warning actually
                            # shows the motion text that was collected.
                            self.warning("No motion or bill name found: " +
                                         "motion name: " + cur_motion + "; " +
                                         "decision text: " + stripped)
                            results = {}
                            cur_motion = ""
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                "Vote data found in motion name: " +
                                cur_motion)

                        # Use the collected results to determine who voted how
                        res = {
                            option: results.get(category, [])
                            for category, option in keys.items()
                        }

                        # Count the number of members voting each way
                        yes, no, other = (
                            len(res["yes"]),
                            len(res["no"]),
                            len(res["other"]),
                        )

                        # Almost all of the time, a vote only applies to one bill and this loop
                        # will only be run once.
                        # Some exceptions exist.

                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            try:
                                bc = chambers[cur_bill_id[0]]
                            except KeyError:
                                bc = "other"

                            # Determine whether or not the vote passed
                            if "over the governor's veto" in cur_motion.lower():
                                VETO_SUPERMAJORITY = 2 / 3
                                decided = yes + no
                                # With nobody voting yes or no the override
                                # cannot pass (and must not divide by zero).
                                passed = (decided > 0 and
                                          yes / decided > VETO_SUPERMAJORITY)
                            else:
                                passed = yes > no
                            # Create a Vote object based on the scraped information
                            vote = Vote(
                                chamber=chamber,
                                start_date=cur_date.strftime("%Y-%m-%d"),
                                motion_text=cur_motion,
                                result="pass" if passed else "fail",
                                legislative_session=session,
                                classification="passage",
                                bill=cur_bill_id,
                                bill_chamber=bc,
                            )

                            vote.add_source(pdf_url)
                            vote.add_source(url)
                            vote.set_count("yes", yes)
                            vote.set_count("no", no)
                            vote.set_count("other", other)
                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                for voter in res[key]:
                                    vote.vote(key, voter)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name, option in keys.items():
                                # Need to search for the singular, not plural, in the text
                                # so it can find, for example,  " 1 NAY "
                                vote_re = r"(\d+)\s{}".format(
                                    category_name[:-1])
                                motion_count = int(
                                    re.findall(vote_re, cur_motion)[0])

                                for item in vote.counts:
                                    if item["option"] == option:
                                        vote_count = item["value"]

                                # NOTE(review): a mismatch is only reported;
                                # the counts stored on the Vote stay the
                                # roll-call totals - confirm that is intended.
                                if motion_count != vote_count:
                                    self.warning(
                                        "Motion text vote counts ({}) ".format(
                                            motion_count) +
                                        "differed from roll call counts ({}) ".
                                        format(vote_count) +
                                        "for {0} on {1}".format(
                                            category_name, cur_bill_id))

                            yield vote

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
Ejemplo n.º 9
0
    def scrape_vote(self, bill, name, url):
        """Yield a Vote scraped from the HTML tally sheet at *url*.

        URLs containing "VOTE/h" are treated as lower-chamber sheets and all
        others as upper-chamber sheets; the two use different column layouts
        for the member table.  Pages containing "BUDGET ADDRESS" are skipped.

        Args:
            bill: bill the vote belongs to; its ``legislative_session``
                supplies the year of the vote date.
            name: motion text for the Vote.
            url: address of the tally sheet; recorded as the source.

        Yields:
            One Vote, or nothing for a skipped page.

        Raises:
            AttributeError: if one of the expected labelled spans holds no
                number or date.
        """
        # cols lists the 1-based column where each member group starts; the
        # offsets locate the name / yes-mark / no-mark cells within a group.
        if "VOTE/h" in url:
            vote_chamber = "lower"
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = "upper"
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        html = self.get(url, verify=False).text

        if "BUDGET ADDRESS" in html:
            return

        page = lxml.html.fromstring(html)

        def span_int(path):
            """Return the first run of digits in the text selected by *path*."""
            text = page.xpath(path)
            return int(re.match(r"[^\d]*(\d+)[^\d]*", text).group(1))

        yes_count = span_int("string(//span[contains(., 'Those voting Yea')])")
        no_count = span_int("string(//span[contains(., 'Those voting Nay')])")
        other_count = span_int("string(//span[contains(., 'Those absent')])")
        need_count = span_int("string(//span[contains(., 'Necessary for')])")

        # The sheet shows only month/day; the year comes from the session.
        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
        date = date.replace(" ", "")
        date = datetime.datetime.strptime(
            date + " " + bill.legislative_session, "%m/%d %Y"
        ).date()

        # NOTE(review): "Necessary for" is read as the minimum number of yeas
        # required, so reaching it exactly counts as a pass - confirm against
        # the chamber's tally sheets.
        # not sure about classification.
        vote = Vote(
            chamber=vote_chamber,
            start_date=date,
            motion_text=name,
            result="pass" if yes_count >= need_count else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for col in cols:
                voter = row.xpath("string(td[%d])" % (col + name_offset)).strip()

                if not voter or voter == "VACANT":
                    continue
                voter = string.capwords(voter)
                if "Y" in row.xpath("string(td[%d])" % (col + yes_offset)):
                    vote.yes(voter)
                elif "N" in row.xpath("string(td[%d])" % (col + no_offset)):
                    vote.no(voter)
                else:
                    vote.vote("other", voter)

        yield vote
Ejemplo n.º 10
0
    def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        """Fetch an HTML vote page and yield the Vote it describes.

        Nothing is yielded when the page cannot be fetched, when the
        description mentions a voice vote, or when the description is a bare
        "-" or contains "UTAH STATE LEGISLATURE".  Descriptions mentioning a
        committee are delegated to ``scrape_committee_vote``.  Otherwise the
        description must contain "Passed" or "Failed".

        Raises:
            NotImplementedError: if the outcome cannot be read from the
                description.
            KeyError: if the page lacks a "Yeas", "Nays" or
                "Absent or not voting" heading.
        """
        try:
            html = self.get(url).text
        except scrapelib.HTTPError:
            message = "A vote page not found for bill {}".format(bill.identifier)
            self.warning(message)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        descr = doc.xpath("//b")[0].text_content()
        if descr == "":
            # New page method
            descr = doc.xpath("//center")[0].text

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            yield from self.scrape_committee_vote(
                bill, actor, date, motion, doc, url, uniqid
            )
            return

        if "Passed" in descr:
            outcome = "pass"
        elif "Failed" in descr:
            outcome = "fail"
        elif "UTAH STATE LEGISLATURE" in descr or descr.strip() == "-":
            return
        else:
            self.warning(descr)
            raise NotImplementedError("Can't see if we passed or failed")

        # Pair every <b> heading after the first with the table in the same
        # position; a heading reads "<label> - <count>".
        tallies = {}
        for heading, table in zip(doc.xpath("//b")[1:], doc.xpath("//table")):
            pieces = [part.strip() for part in heading.text_content().split("-", 1)]
            if len(pieces) != 2:
                continue
            label, raw_count = pieces
            tallies[label.strip()] = {
                "count": int(raw_count),
                "people": [
                    cell.text_content().strip()
                    for cell in table.xpath(".//font[@face='Arial']")
                ],
            }

        vote = Vote(
            chamber=actor,
            start_date=date,
            motion_text=motion,
            result=outcome,
            bill=bill,
            classification="passage",
            identifier=str(uniqid),
        )

        labels = (
            ("yes", "Yeas"),
            ("no", "Nays"),
            ("other", "Absent or not voting"),
        )
        for option, label in labels:
            vote.set_count(option, tallies[label]["count"])
        vote.add_source(url)

        recorders = {
            "yes": vote.yes,
            "no": vote.no,
            "other": lambda person: vote.vote("other", person),
        }
        for option, label in labels:
            for person in tallies[label]["people"]:
                recorders[option](person)

        yield vote