Ejemplo n.º 1
0
    def scrape_committee_vote(self, bill, actor, date, motion, page, url,
                              uniqid):
        votes = page.xpath("//table")[0]
        rows = votes.xpath(".//tr")[0]
        if rows[0].text_content() == "Votes:":
            # New webste
            rows = votes.xpath(".//tr")[2]
        yno = rows.xpath(".//td")
        if len(yno) < 3:
            yes = yno[0]
            no, other = None, None
        else:
            yes, _, no, _, other = rows.xpath(".//td")[:5]

        def proc_block(obj, typ):
            if obj is None:
                return {"type": None, "count": None, "votes": []}
            votes = []
            for vote in obj.xpath("./text()"):
                if vote.strip():
                    vote = vote.strip()
                    if vote:
                        votes.append(vote)
            count = len(votes)
            return {"type": typ, "count": count, "votes": votes}

        vote_dict = {
            "yes": proc_block(yes, "yes"),
            "no": proc_block(no, "no"),
            "other": proc_block(other, "other"),
        }

        yes_count = vote_dict["yes"]["count"]
        no_count = vote_dict["no"]["count"] or 0
        other_count = vote_dict["other"]["count"] or 0
        vote = Vote(
            chamber=actor,
            start_date=date,
            motion_text=motion,
            identifier=str(uniqid),
            result="pass" if (yes_count > no_count) else "fail",
            classification="passage",
            bill=bill,
        )
        vote.extras = {"_vote_id": uniqid}
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        for key in vote_dict:
            for voter in vote_dict[key]["votes"]:
                vote.vote(key, voter)

        yield vote
Ejemplo n.º 2
0
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        archive_year = int(session[0:4])
        not_archive_year = archive_year >= 2009

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id
            if bill_id.strip() == "SB77" and session == "20052006":
                continue

            fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()
            summary = ""

            # Get digest test (aka "summary") from latest version.
            if bill.versions and not_archive_year:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            self.warning(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}
                    if not_archive_year:
                        assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            source_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # Votes for non archived years
            if archive_year > 2009:
                for vote_num, vote in enumerate(bill.votes):
                    if vote.vote_result == "(PASS)":
                        result = True
                    else:
                        result = False

                    if not vote.location:
                        continue

                    full_loc = vote.location.description
                    first_part = full_loc.split(" ")[0].lower()
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    else:
                        # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                        continue

                    if vote.motion:
                        motion = vote.motion.motion_text or ""
                    else:
                        motion = ""

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"

                    motion = motion.strip()
                    motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.compile(r"^(Senate|Assembly) ",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ",
                                    "", motion)
                    motion = re.sub(r" \(\w+\)$", "", motion)
                    motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                    motion)
                    motion = re.sub(
                        r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                        r"Urgency Clause$",
                        "(Urgency Clause)",
                        motion,
                    )
                    motion = re.sub(r"\s+", " ", motion)

                    if not motion:
                        self.warning("Got blank motion on vote for %s" %
                                     bill_id)
                        continue

                    # XXX this is responsible for all the CA 'committee' votes, not
                    # sure if that's a feature or bug, so I'm leaving it as is...
                    # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                    # org = {
                    # 'name': vote_location,
                    # 'classification': vote_classification
                    # }

                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=self._tz.localize(vote.vote_date_time),
                        result="pass" if result else "fail",
                        classification=vtype,
                        # organization=org,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.extras = {"threshold": vote.threshold}

                    fsvote.add_source(source_url)
                    fsvote.pupa_id = source_url + "#" + str(vote_num)

                    rc = {"yes": [], "no": [], "other": []}
                    for record in vote.votes:
                        if record.vote_code == "AYE":
                            rc["yes"].append(record.legislator_name)
                        elif record.vote_code.startswith("NO"):
                            rc["no"].append(record.legislator_name)
                        else:
                            rc["other"].append(record.legislator_name)

                    # Handle duplicate votes
                    for key in rc.keys():
                        rc[key] = list(set(rc[key]))

                    for key, voters in rc.items():
                        for voter in voters:
                            fsvote.vote(key, voter)
                        # Set counts by summed votes for accuracy
                        fsvote.set_count(key, len(voters))

                    yield fsvote
            if len(bill.votes) > 0 and archive_year <= 2009:
                vote_page_url = (
                    "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
                )
                vote_page_url += (
                    f"bill_id={session}{bill.session_num}{fsbill.identifier}")

                # parse the bill data page, finding the latest html text
                data = self.get(vote_page_url).content
                doc = html.fromstring(data)
                doc.make_links_absolute(vote_page_url)
                num_of_votes = len(doc.xpath("//div[@class='status']"))
                for vote_section in range(1, num_of_votes + 1):
                    lines = doc.xpath(
                        f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                    )
                    date, result, motion, vtype, location = "", "", "", "", ""
                    votes = {}
                    for line in lines:
                        line = line.text_content().split()
                        if line[0] == "Date":
                            date = line[1]
                            date = datetime.datetime.strptime(date, "%m/%d/%y")
                            date = self._tz.localize(date)
                        elif line[0] == "Result":
                            result = "pass" if "PASS" in line[1] else "fail"
                        elif line[0] == "Motion":
                            motion = " ".join(line[1:])
                        elif line[0] == "Location":
                            location = " ".join(line[1:])
                        elif len(line) > 1:
                            if line[0] == "Ayes" and line[1] != "Count":
                                votes["yes"] = line[1:]
                            elif line[0] == "Noes" and line[1] != "Count":
                                votes["no"] = line[1:]
                            elif line[0] == "NVR" and line[1] != "Count":
                                votes["not voting"] = line[1:]
                    # Determine chamber based on location
                    first_part = location.split(" ")[0].lower()
                    vote_chamber = ""
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"
                    if len(motion) > 0:
                        fsvote = VoteEvent(
                            motion_text=motion,
                            start_date=date,
                            result=result,
                            classification=vtype,
                            chamber=vote_chamber,
                            bill=fsbill,
                        )
                        fsvote.add_source(vote_page_url)
                        fsvote.pupa_id = vote_page_url + "#" + str(
                            vote_section)

                        for how_voted, voters in votes.items():
                            for voter in voters:
                                voter = voter.replace(",", "")
                                fsvote.vote(how_voted, voter)
                        yield fsvote

            yield fsbill
            self.session.expire_all()