Ejemplo n.º 1
0
def test_vote_event_pupa_identifier_dedupe():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id',
                                name='Legislature',
                                classification='legislature',
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(legislative_session='1900',
                                 start_date='2013',
                                 classification='anything',
                                 result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    vote_event.pupa_id = 'foo'

    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    bi = BillImporter('jid', dmi, oi)

    _, what = VoteEventImporter('jid', dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    vote_event.identifier = 'First Roll Call'
    _, what = VoteEventImporter('jid', dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new pupa identifier, insert
    vote_event.pupa_id = 'bar'
    _, what = VoteEventImporter('jid', dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
Ejemplo n.º 2
0
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id)

    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
Ejemplo n.º 3
0
    def parse_vote_page(self, vote_url, bill):
        vote_html = self.get(vote_url).text
        doc = lxml.html.fromstring(vote_html)
        # chamber
        if "senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        # date in the following format: Mar 23, 2009
        date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
        date = date.replace(u"\xa0", " ")
        date = datetime.datetime.strptime(date[18:], "%b %d, %Y")

        # motion
        motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]'))
        if motion == "":
            motion = "No motion given"  # XXX: Double check this. See SJ 3.
        motion = motion.replace(u"\xa0", " ")

        # totals
        tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
        totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
        yes_count = int(totals[0].split()[-1])
        no_count = int(totals[1].split()[-1])
        other_count = int(totals[2].split()[-1])
        other_count += int(totals[3].split()[-1])
        other_count += int(totals[4].split()[-1])
        passed = yes_count > no_count

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )
        vote.pupa_id = vote_url  # contains sequence number
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)

        # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
        func = None
        for td in doc.xpath("//td/text()"):
            td = td.replace(u"\xa0", " ")
            if td.startswith("Voting Yea"):
                func = vote.yes
            elif td.startswith("Voting Nay"):
                func = vote.no
            elif td.startswith("Not Voting"):
                func = vote.other
            elif td.startswith("Excused"):
                func = vote.other
            elif func:
                td = td.rstrip("*")
                func(td)

        return vote
Ejemplo n.º 4
0
    def scrape_senate_vote(self, bill, url, date):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber="upper",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text="Passage",
            # setting 'fail' for now.
            result="fail",
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        vote.pupa_id = url

        text = convert_pdf(filename, "text").decode("utf-8")
        os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
        data = filter(None, data)
        keymap = dict(yea="yes", nay="no")
        actual_vote = collections.defaultdict(int)
        vote_count = {"yes": 0, "no": 0, "other": 0}
        while True:
            if not data:
                break
            vote_val = data.pop()
            key = keymap.get(vote_val.lower(), "other")
            values = data.pop()
            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
                if name.lower().strip() == "none.":
                    continue
                name = name.replace("..", "")
                name = re.sub(r"\.$", "", name)
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = (
            "pass"
            if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
            else "fail"
        )

        yield vote
Ejemplo n.º 5
0
    def add_vote(self, bill, chamber, date, text, url):
        votes = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        vtype = 'other'
        for regex, type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result='pass' if yes > no else 'fail',
            classification=vtype,
            bill=bill,
        )
        v.pupa_id = url.split('/')[-1]
        v.set_count('yes', yes)
        v.set_count('no', no)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if 'av' in url:
                self.add_house_votes(v, url)
            elif 'sv' in url:
                self.add_senate_votes(v, url)

        return v
Ejemplo n.º 6
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace('XHB', 'HB')
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower'
    )
    vote_event.pupa_id = url
    vote_event.set_count('yes', len(vote_record['yes']))
    vote_event.set_count('no', len(vote_record['no']))
    vote_event.set_count('excused', len(vote_record['excused']))
    vote_event.set_count('absent', len(vote_record['absent']))
    vote_event.set_count('other', len(vote_record['other']))
    for vote_type in ['yes', 'no', 'excused', 'absent', 'other']:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Ejemplo n.º 7
0
    def add_vote(self, bill, chamber, date, text, url):
        votes = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        vtype = 'other'
        for regex, type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result='pass' if yes > no else 'fail',
            classification=vtype,
            bill=bill,
        )
        v.pupa_id = url.split('/')[-1]
        v.set_count('yes', yes)
        v.set_count('no', no)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if 'av' in url:
                self.add_house_votes(v, url)
            elif 'sv' in url:
                self.add_senate_votes(v, url)

        return v
Ejemplo n.º 8
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace('XHB', 'HB')
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower')
    vote_event.pupa_id = url
    vote_event.set_count('yes', len(vote_record['yes']))
    vote_event.set_count('no', len(vote_record['no']))
    vote_event.set_count('excused', len(vote_record['excused']))
    vote_event.set_count('absent', len(vote_record['absent']))
    vote_event.set_count('other', len(vote_record['other']))
    for vote_type in ['yes', 'no', 'excused', 'absent', 'other']:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Ejemplo n.º 9
0
 def scrape_votes(self, bill, bill_page, chamber):
     vote_links = bill_page.xpath(
         '//div[contains(@class, "col-sm-8")]//a[contains(@href, "view_votes")]')
     for vote_link in vote_links:
         vote_url = vote_link.attrib['href']
         date_td, motion_td, *_ = vote_link.xpath('ancestor::tr/td')
         date = datetime.strptime(date_td.text, '%b %d, %Y')
         motion_text = motion_td.text_content()
         vote_page = self.lxmlize(vote_url)
         passed = (
             'Passed' in motion_text or
             'Advanced' in motion_text
         )
         cells = vote_page.xpath('//table[contains(@class, "calendar-table")]//td')
         vote = VoteEvent(
             bill=bill,
             chamber=chamber,
             start_date=TIMEZONE.localize(date),
             motion_text=motion_text,
             classification='passage',
             result='pass' if passed else 'fail',
         )
         query_params = urllib.parse.parse_qs(urllib.parse.urlparse(vote_url).query)
         vote.pupa_id = query_params['KeyID'][0]
         vote.add_source(vote_url)
         for chunk in range(0, len(cells), 2):
             name = cells[chunk].text
             vote_type = cells[chunk + 1].text
             if name and vote_type:
                 vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'), name)
         yield vote
Ejemplo n.º 10
0
    def add_vote(self, bill, chamber, date, text, url):
        votes = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)",
                           text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        vtype = "other"
        for regex, type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result="pass" if yes > no else "fail",
            classification=vtype,
            bill=bill,
        )
        v.pupa_id = url.split("/")[-1]
        v.set_count("yes", yes)
        v.set_count("no", no)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if "av" in url:
                self.add_house_votes(v, url)
            elif "sv" in url:
                self.add_senate_votes(v, url)

        return v
Ejemplo n.º 11
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace("XHB", "HB")
    passed = len(vote_record["yes"]) > len(vote_record["no"])
    vote_event = VoteEvent(
        result="pass" if passed else "fail",
        chamber=chamber,
        start_date=vote_record["date"].strftime("%Y-%m-%d"),
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber="upper" if bill_id[0] == "S" else "lower",
    )
    vote_event.pupa_id = url
    vote_event.set_count("yes", len(vote_record["yes"]))
    vote_event.set_count("no", len(vote_record["no"]))
    vote_event.set_count("excused", len(vote_record["excused"]))
    vote_event.set_count("absent", len(vote_record["absent"]))
    vote_event.set_count("other", len(vote_record["other"]))
    for vote_type in ["yes", "no", "excused", "absent", "other"]:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Ejemplo n.º 12
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
        resp = self.get(vote_url)
        html = resp.text

        # sometimes the link is broken, will redirect to NO_VOTE_URL
        if resp.url == NO_VOTE_URL:
            return

        doc = lxml.html.fromstring(html)
        try:
            motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
        except IndexError:
            self.logger.warning("Bill was missing a motion number, skipping")
            return

        vote_count = doc.xpath(
            ".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
        yeas = int(vote_count[0])
        nays = int(vote_count[3])

        # second paragraph has date
        paragraphs = doc.xpath(".//div[@id='leg_PageContent']/div/p/text()")
        date = None
        for p in paragraphs:
            try:
                date = datetime.datetime.strptime(p.strip(), '%m/%d/%Y').date()
                break
            except ValueError:
                pass
        if date is None:
            self.logger.warning("No date could be found for vote on %s" %
                                motion)
            return

        vote = VoteEvent(chamber='lower',
                         start_date=date,
                         motion_text=motion,
                         result='pass' if yeas > nays else 'fail',
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=chamber)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.add_source(vote_url)
        vote.pupa_id = vote_url

        # first table has YEAs
        for name in doc.xpath('//table[1]/tr/td/font/text()'):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath('//table[2]/tr/td/font/text()'):
            vote.no(name.strip())

        yield vote
Ejemplo n.º 13
0
    def parse_vote(self, bill, link):
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if len(opinions) > 0:
            temp = opinions[0].split()
            vote_chamber = temp[0]
            vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
            vote_status = " ".join(temp[2:-2])
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 14
0
    def scrape_senate_vote(self, bill, url, date):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber='upper',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text='Passage',
            # setting 'fail' for now.
            result='fail',
            classification='passage',
            bill=bill)
        vote.add_source(url)
        vote.pupa_id = url

        text = convert_pdf(filename, 'text').decode('utf-8')
        os.remove(filename)

        if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url,
                                                    date)
            return

        data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        data = filter(None, data)
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)
        vote_count = {'yes': 0, 'no': 0, 'other': 0}
        while True:
            if not data:
                break
            vote_val = data.pop()
            key = keymap.get(vote_val.lower(), 'other')
            values = data.pop()
            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
                if name.lower().strip() == 'none.':
                    continue
                name = name.replace('..', '')
                name = re.sub(r'\.$', '', name)
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = 'pass' if vote_count['yes'] > (
            vote_count['no'] + vote_count['other']) else 'fail'

        yield vote
Ejemplo n.º 15
0
def test_vote_event_pupa_identifier_dedupe():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id', name='Legislature',
                                classification='legislature',
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    vote_event.pupa_id = 'foo'

    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    bi = BillImporter('jid', dmi, oi)

    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    vote_event.identifier = 'First Roll Call'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new pupa identifier, insert
    vote_event.pupa_id = 'bar'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
Ejemplo n.º 16
0
    def parse_vote(self, bill, link):
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if len(opinions) > 0:
            temp = opinions[0].split()
            vote_chamber = temp[0]
            vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
            vote_status = " ".join(temp[2:-2])
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except:
                    pass
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count
            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 17
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
        resp = self.get(vote_url)
        html = resp.text

        # sometimes the link is broken, will redirect to NO_VOTE_URL
        if resp.url == NO_VOTE_URL:
            return

        doc = lxml.html.fromstring(html)
        try:
            motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
        except IndexError:
            self.logger.warning("Bill was missing a motion number, skipping")
            return

        vote_count = doc.xpath(".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
        yeas = int(vote_count[0])
        nays = int(vote_count[3])

        # second paragraph has date
        paragraphs = doc.xpath(".//div[@id='leg_PageContent']/div/p/text()")
        date = None
        for p in paragraphs:
            try:
                date = datetime.datetime.strptime(p.strip(), '%m/%d/%Y').date()
                break
            except ValueError:
                pass
        if date is None:
            self.logger.warning("No date could be found for vote on %s" % motion)
            return

        vote = VoteEvent(chamber='lower', start_date=date, motion_text=motion,
                         result='pass' if yeas > nays else 'fail',
                         classification='passage',
                         legislative_session=session, bill=bill_id,
                         bill_chamber=chamber)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.add_source(vote_url)
        vote.pupa_id = vote_url

        # first table has YEAs
        for name in doc.xpath('//table[1]/tr/td/font/text()'):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath('//table[2]/tr/td/font/text()'):
            vote.no(name.strip())

        yield vote
Ejemplo n.º 18
0
    def scrape_chamber_votes(self, chamber, session):
        url = {
            "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
            "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
        }[chamber]
        action = "%s/%s" % (url, "votes.asp")
        dates = self.get_vote_dates(url, session)
        for date in dates:
            votes = self.parse_vote_page(self.post_to(action, date), url,
                                         session)
            for vote_dict in votes:
                for vote in vote_dict.values():
                    count = vote['count']
                    chamber = {
                        "H": "lower",
                        "S": "upper"
                    }[vote['meta']['chamber']]

                    try:
                        bill_id = self._bill_id_by_type[(chamber,
                                                         vote['meta']['bill'])]
                    except:
                        self.warning('no such bill_id %s %s', chamber,
                                     vote['meta']['bill'])
                        continue

                    v = VoteEvent(
                        chamber=chamber,
                        start_date=vote['time'].strftime('%Y-%m-%d'),
                        motion_text=vote['meta']['extra']['motion'],
                        result='pass' if count['passage'] else 'fail',
                        classification='passage',
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=chamber,
                    )
                    v.set_count('yes', int(count['YEAS']))
                    v.set_count('no', int(count['NAYS']))
                    v.set_count('other', int(count['NOT VOTING']))
                    v.add_source(vote['source'])
                    v.pupa_id = vote['source']

                    for vt in vote['votes']:
                        key = {
                            'Y': 'yes',
                            'N': 'no',
                        }.get(vt['vote'], 'other')
                        v.vote(key, vt['name'])
                    yield v
Ejemplo n.º 19
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = ('upper'
                   if 'Senate' in doc.xpath('string(//h1)') else 'lower')
        committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath('../../td')[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(' - ')[-1].strip()
            motion = 'Committee vote (%s): %s' % (committee, motion)

            # Roll call
            vote_url = link.attrib['href']
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification='other',
                result='pass' if rollcall['passed'] else 'fail',
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count('yes', rollcall['yes_count'])
            vote.set_count('no', rollcall['no_count'])
            vote.set_count('other', rollcall['other_count'])

            for voteval in ('yes', 'no', 'other'):
                for name in rollcall.get(voteval + '_votes', []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Ejemplo n.º 20
0
    def scrape_votes(self, bill, bill_page, chamber):
        vote_links = bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        )
        for vote_link in vote_links:
            vote_url = vote_link.attrib['href']
            date_td, motion_td, *_ = vote_link.xpath('ancestor::tr/td')
            date = datetime.strptime(date_td.text, '%b %d, %Y')
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            passed = ('Passed' in motion_text or 'Advanced' in motion_text)
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td')
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(date),
                motion_text=motion_text,
                classification='passage',
                result='pass' if passed else 'fail',
            )

            yes_count = self.process_count(vote_page, 'Yes:')
            no_count = self.process_count(vote_page, 'No:')
            exc_count = self.process_count(vote_page, 'Excused - Not Voting:')
            absent_count = self.process_count(vote_page,
                                              'Absent - Not Voting:')
            present_count = self.process_count(vote_page,
                                               'Present - Not Voting:')

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', exc_count)
            vote.set_count('absent', absent_count)
            vote.set_count('abstain', present_count)

            query_params = urllib.parse.parse_qs(
                urllib.parse.urlparse(vote_url).query)
            vote.pupa_id = query_params['KeyID'][0]
            vote.add_source(vote_url)
            for chunk in range(0, len(cells), 2):
                name = cells[chunk].text
                vote_type = cells[chunk + 1].text
                if name and vote_type:
                    vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'),
                              name)
            yield vote
Ejemplo n.º 21
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
        committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath('../../td')[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(' - ')[-1].strip()
            motion = 'Committee vote (%s): %s' % (committee, motion)

            # Roll call
            vote_url = link.attrib['href']
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification='other',
                result='pass' if rollcall['passed'] else 'fail',
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count('yes', rollcall['yes_count'])
            vote.set_count('no', rollcall['no_count'])
            vote.set_count('other', rollcall['other_count'])

            for voteval in ('yes', 'no', 'other'):
                for name in rollcall.get(voteval + '_votes', []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Ejemplo n.º 22
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower"
        committee = tuple(doc.xpath("//h2")[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath("../../td")[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(" - ")[-1].strip()
            motion = "Committee vote (%s): %s" % (committee, motion)

            # Roll call
            vote_url = link.attrib["href"]
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification="other",
                result="pass" if rollcall["passed"] else "fail",
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count("yes", rollcall["yes_count"])
            vote.set_count("no", rollcall["no_count"])
            vote.set_count("other", rollcall["other_count"])

            for voteval in ("yes", "no", "other"):
                for name in rollcall.get(voteval + "_votes", []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Ejemplo n.º 23
0
    def scrape_votes(self, bill, bill_page, chamber):
        vote_links = bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        )
        for vote_link in vote_links:
            vote_url = vote_link.attrib["href"]
            date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
            date = datetime.strptime(date_td.text, "%b %d, %Y")
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            passed = "Passed" in motion_text or "Advanced" in motion_text
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td')
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(date),
                motion_text=motion_text,
                classification="passage",
                result="pass" if passed else "fail",
            )

            yes_count = self.process_count(vote_page, "Yes:")
            no_count = self.process_count(vote_page, "No:")
            exc_count = self.process_count(vote_page, "Excused - Not Voting:")
            absent_count = self.process_count(vote_page,
                                              "Absent - Not Voting:")
            present_count = self.process_count(vote_page,
                                               "Present - Not Voting:")

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", present_count)

            query_params = urllib.parse.parse_qs(
                urllib.parse.urlparse(vote_url).query)
            vote.pupa_id = query_params["KeyID"][0]
            vote.add_source(vote_url)
            for chunk in range(0, len(cells), 2):
                name = cells[chunk].text
                vote_type = cells[chunk + 1].text
                if name and vote_type:
                    vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"),
                              name)
            yield vote
Ejemplo n.º 24
0
    def scrape_chamber_votes(self, chamber, session):
        url = {
            "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
            "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
        }[chamber]
        action = "%s/%s" % (url, "votes.asp")
        dates = self.get_vote_dates(url, session)
        for date in dates:
            votes = self.parse_vote_page(self.post_to(action, date), url,
                                         session)
            for vote_dict in votes:
                for vote in vote_dict.values():
                    count = vote["count"]
                    chamber = {
                        "H": "lower",
                        "S": "upper"
                    }[vote["meta"]["chamber"]]

                    try:
                        bill_id = self._bill_id_by_type[(chamber,
                                                         vote["meta"]["bill"])]
                    except KeyError:
                        self.warning("no such bill_id %s %s", chamber,
                                     vote["meta"]["bill"])
                        continue

                    v = VoteEvent(
                        chamber=chamber,
                        start_date=vote["time"].strftime("%Y-%m-%d"),
                        motion_text=vote["meta"]["extra"]["motion"],
                        result="pass" if count["passage"] else "fail",
                        classification="passage",
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=chamber,
                    )
                    v.set_count("yes", int(count["YEAS"]))
                    v.set_count("no", int(count["NAYS"]))
                    v.set_count("other", int(count["NOT VOTING"]))
                    v.add_source(vote["source"])
                    v.pupa_id = vote["source"]

                    for vt in vote["votes"]:
                        key = {"Y": "yes", "N": "no"}.get(vt["vote"], "other")
                        v.vote(key, vt["name"])
                    yield v
Ejemplo n.º 25
0
    def scrape_votes(self, bill, page):
        base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
        for header in page['FloorHeaders']:
            params = {
                'billStatusId': page['BillId'],
                'billStatusActionId': header['BillStatusActionId'],
                'includeVotes': 'true',
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode('utf-8'))

            for action in actions:
                if action['Action'] == 'No Action':
                    continue
                action_date = datetime.datetime.strptime(action['ReportDate'], '%Y-%m-%dT%H:%M:%S')
                vote = VoteEvent(
                    chamber={
                        'S': 'upper',
                        'H': 'lower',
                    }[header['LegislativeBody']],
                    motion_text=action['Action'],
                    classification='passage',
                    result=(
                        'pass'
                        if action['UnanimouslyAdopted'] or action['Ayes'] > action['Nays']
                        else 'fail'
                    ),
                    start_date=action_date.strftime('%Y-%m-%d'),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count('yes', action['Ayes'] or 0)
                vote.set_count('no', action['Nays'] or 0)
                vote.set_count('other', (action['Present'] or 0))
                vote.set_count('absent', (action['Absent'] or 0))
                vote.set_count('excused', (action['Excused'] or 0))
                vote.set_count('not voting', (action['NotVoting'] or 0))

                for v in action['Votes']:
                    vote_type = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(v['Vote'], 'other')
                    vote.vote(vote_type, v['Legislator']['FullName'])
                vote.pupa_id = resp.url+str(action['ReferralNumber'])
                yield vote
Ejemplo n.º 26
0
    def scrape_chamber_votes(self, chamber, session):
        url = {
            "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
            "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
        }[chamber]
        action = "%s/%s" % (url, "votes.asp")
        dates = self.get_vote_dates(url, session)
        for date in dates:
            votes = self.parse_vote_page(self.post_to(action, date), url, session)
            for vote_dict in votes:
                for vote in vote_dict.values():
                    count = vote['count']
                    chamber = {
                        "H": "lower",
                        "S": "upper"
                    }[vote['meta']['chamber']]

                    try:
                        bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                    except KeyError:
                        self.warning('no such bill_id %s %s', chamber, vote['meta']['bill'])
                        continue

                    v = VoteEvent(
                        chamber=chamber,
                        start_date=vote['time'].strftime('%Y-%m-%d'),
                        motion_text=vote['meta']['extra']['motion'],
                        result='pass' if count['passage'] else 'fail',
                        classification='passage',
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=chamber,
                    )
                    v.set_count('yes', int(count['YEAS']))
                    v.set_count('no', int(count['NAYS']))
                    v.set_count('other', int(count['NOT VOTING']))
                    v.add_source(vote['source'])
                    v.pupa_id = vote['source']

                    for vt in vote['votes']:
                        key = {
                            'Y': 'yes',
                            'N': 'no',
                        }.get(vt['vote'], 'other')
                        v.vote(key, vt['name'])
                    yield v
Ejemplo n.º 27
0
    def scrape_votes(self, bill, page):
        base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
        for header in page['FloorHeaders']:
            params = {
                'billStatusId': page['BillId'],
                'billStatusActionId': header['BillStatusActionId'],
                'includeVotes': 'true',
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode('utf-8'))

            for action in actions:
                if action['Action'] == 'No Action':
                    continue
                action_date = datetime.datetime.strptime(
                    action['ReportDate'], '%Y-%m-%dT%H:%M:%S')
                vote = VoteEvent(
                    chamber={
                        'S': 'upper',
                        'H': 'lower',
                    }[header['LegislativeBody']],
                    motion_text=action['Action'],
                    classification='passage',
                    result=('pass' if action['UnanimouslyAdopted']
                            or action['Ayes'] > action['Nays'] else 'fail'),
                    start_date=action_date.strftime('%Y-%m-%d'),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count('yes', action['Ayes'] or 0)
                vote.set_count('no', action['Nays'] or 0)
                vote.set_count('other', (action['Present'] or 0))
                vote.set_count('absent', (action['Absent'] or 0))
                vote.set_count('excused', (action['Excused'] or 0))
                vote.set_count('not voting', (action['NotVoting'] or 0))

                for v in action['Votes']:
                    vote_type = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(v['Vote'], 'other')
                    vote.vote(vote_type, v['Legislator']['FullName'])
                vote.pupa_id = resp.url + str(action['ReferralNumber'])
                yield vote
Ejemplo n.º 28
0
    def scrape_votes(self, bill, page):
        base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
        for header in page["FloorHeaders"]:
            params = {
                "billStatusId": page["BillId"],
                "billStatusActionId": header["BillStatusActionId"],
                "includeVotes": "true",
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode("utf-8"))

            for action in actions:
                if action["Action"] == "No Action":
                    continue
                action_date = datetime.datetime.strptime(
                    action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
                vote = VoteEvent(
                    chamber={
                        "S": "upper",
                        "H": "lower"
                    }[header["LegislativeBody"]],
                    motion_text=action["Action"],
                    classification="passage",
                    result=("pass" if action["UnanimouslyAdopted"]
                            or action["Ayes"] > action["Nays"] else "fail"),
                    start_date=action_date.strftime("%Y-%m-%d"),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count("yes", action["Ayes"] or 0)
                vote.set_count("no", action["Nays"] or 0)
                vote.set_count("other", (action["Present"] or 0))
                vote.set_count("absent", (action["Absent"] or 0))
                vote.set_count("excused", (action["Excused"] or 0))
                vote.set_count("not voting", (action["NotVoting"] or 0))

                for v in action["Votes"]:
                    vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                    vote.vote(vote_type, v["Legislator"]["FullName"])
                vote.pupa_id = resp.url + str(action["ReferralNumber"])
                yield vote
Ejemplo n.º 29
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result='pass' if self.passed() else 'fail',
            classification='passage',
            bill=self.bill,
        )
        v.pupa_id = self.url         # URL contains sequence number
        v.set_count('yes', self.yes_count())
        v.set_count('no', self.no_count())
        v.set_count('other', self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote('other', voter)
        v.add_source(self.url)
        return v
Ejemplo n.º 30
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result='pass' if self.passed() else 'fail',
            classification='passage',
            bill=self.bill,
        )
        v.pupa_id = self.url         # URL contains sequence number
        v.set_count('yes', self.yes_count())
        v.set_count('no', self.no_count())
        v.set_count('other', self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote('other', voter)
        v.add_source(self.url)
        return v
Ejemplo n.º 31
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result="pass" if self.passed() else "fail",
            classification="passage",
            bill=self.bill,
        )
        v.pupa_id = self.url  # URL contains sequence number
        v.set_count("yes", self.yes_count())
        v.set_count("no", self.no_count())
        v.set_count("other", self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote("other", voter)
        v.add_source(self.url)
        return v
Ejemplo n.º 32
0
    def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict, vote_results):
        for v in votes["items"]:
            try:
                v["yeas"]
            except KeyError:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url+v["link"]).json()
                try:
                    v["yeas"]
                except KeyError:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"
            try:
                date = self._tz.localize(datetime.datetime.strptime(v["date"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                try:
                    date = self._tz.localize(datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                    date = "{:%Y-%m-%d}".format(date)
                except KeyError:
                    self.logger.warning("No date found for vote, skipping")
                    continue
            try:
                motion = v["action"]
            except KeyError:
                motion = v["motiontype"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
            if (not motion and isinstance(v['yeas'], str)
               and isinstance(v['nays'], str)):
                waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(waringText.format(v['revno']))
                continue

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v['yeas']) > len(v['nays']):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            if "committee" in v:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 # organization=v["committee"],
                                 bill=bill,
                                 classification='passed'
                                 )
            else:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 classification='passed',
                                 bill=bill
                                 )
            # Concatenate the bill identifier and vote identifier to avoid collisions
            vote.pupa_id = '{}:{}'.format(bill.identifier.replace(' ', ''), v['revno'])
            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is.
            yes_count = 0
            no_count = 0
            absent_count = 0
            excused_count = 0
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
                yes_count += 1
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
                no_count += 1
            if "absent" in v:
                for voter_id in v["absent"]:
                    vote.vote('absent', legislators[voter_id])
                    absent_count += 1
            if "excused" in v:
                for voter_id in v["excused"]:
                    vote.vote('excused', legislators[voter_id])
                    excused_count += 1

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            for key, val in v.items():
                if (type(val) == list and len(val) > 0 and
                   key not in ["yeas", "nays", "absent", "excused"]):
                    if val[0] in legislators:
                        self.logger.warning("{k} looks like a vote type that's not being counted."
                                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
Ejemplo n.º 33
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        vote_pdf, resp = self.urlretrieve(url)
        text = convert_pdf(vote_pdf, 'text')
        os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        precursors = (
            ('yeas--', yes_votes),
            ('nays--', no_votes),
            ('absent or those not voting--', absent_votes),
            ('absent and those not voting--', absent_votes),
            ('not voting--', not_voting_votes),
            ('voting present--', other_votes),
            ('present--', other_votes),
            ('disclaimer', None),
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split('\n'))

        for line in lines:

            # check if the line starts with a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    line = line.replace(pc, '')

            # split names
            for name in line.split(','):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if 'None.' in name:
                    cur_array = None

                match = re.match(r'(.+?)\. Total--.*', name)
                if match:
                    cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = False
                for junk in ('on final passage', 'Necessary', 'who would have',
                             'being a tie', 'therefore', 'Vacancies', 'a pair',
                             'Total-', 'ATTORNEY', 'on final passage',
                             'SPEAKER', 'BOARD', 'TREASURER', 'GOVERNOR',
                             'ARCHIVES', 'SECRETARY'):
                    if junk in name:
                        junk_in_name = True
                        break
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == '.':
                        name = name[:-1]
                    cur_array.append(name)

        # return vote object
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        absent_count = len(absent_votes)
        not_voting_count = len(not_voting_votes)
        other_count = len(other_votes)

        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(date),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passage',
                         bill=bill)
        vote.pupa_id = url + '#' + bill.identifier

        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('absent', absent_count)
        vote.set_count('not voting', not_voting_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        for yes_vote in yes_votes:
            vote.vote('yes', yes_vote)
        for no_vote in no_votes:
            vote.vote('no', no_vote)
        for absent_vote in absent_votes:
            vote.vote('absent', absent_vote)
        for not_voting_vote in not_voting_votes:
            vote.vote('not voting', not_voting_vote)
        for other_vote in other_votes:
            vote.vote('other', other_vote)
        yield vote
Ejemplo n.º 34
0
    def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict, vote_results):
        for v in votes["items"]:
            try:
                v["yeas"]
            except KeyError:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url+v["link"]).json()
                try:
                    v["yeas"]
                except KeyError:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"
            try:
                date = self._tz.localize(datetime.datetime.strptime(v["date"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                try:
                    date = self._tz.localize(datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                    date = "{:%Y-%m-%d}".format(date)
                except KeyError:
                    self.logger.warning("No date found for vote, skipping")
                    continue
            try:
                motion = v["action"]
            except KeyError:
                motion = v["motiontype"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
            if (not motion and isinstance(v['yeas'], str)
               and isinstance(v['nays'], str)):
                waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(waringText.format(v['revno']))
                continue

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v['yeas']) > len(v['nays']):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            if "committee" in v:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 # organization=v["committee"],
                                 bill=bill,
                                 classification='passed'
                                 )
            else:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 classification='passed',
                                 bill=bill
                                 )
            vote.pupa_id = str(v['revno'])
            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is.
            yes_count = 0
            no_count = 0
            absent_count = 0
            excused_count = 0
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
                yes_count += 1
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
                no_count += 1
            if "absent" in v:
                for voter_id in v["absent"]:
                    vote.vote('absent', legislators[voter_id])
                    absent_count += 1
            if "excused" in v:
                for voter_id in v["excused"]:
                    vote.vote('excused', legislators[voter_id])
                    excused_count += 1

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            for key, val in v.items():
                if (type(val) == list and len(val) > 0 and
                   key not in ["yeas", "nays", "absent", "excused"]):
                    if val[0] in legislators:
                        self.logger.warning("{k} looks like a vote type that's not being counted."
                                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
Ejemplo n.º 35
0
    def scrape_votes(self, bill, url):
        page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={"re": re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if "HOUSE" in header.xpath("string()"):
                chamber = "lower"
                motion_index = 8
            else:
                chamber = "upper"
                motion_index = 13

            motion = header.xpath(
                "string(following-sibling::p[%d])" % motion_index
            ).strip()
            motion = re.sub(r"\s+", " ", motion)
            if not motion.strip():
                self.warning("Motion text not found")
                return
            match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == "PASSED"
            else:
                passed = None

            rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r"\d+/\d+/\d+", date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace("\r\n", " ").strip()
                if "*****" in line:
                    break
                regex = (
                    r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                    r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
                )
                match = re.match(regex, line)
                if match:
                    if match.group(1) == "YEAS" and "RCS#" not in line:
                        vtype = "yes"
                        seen_yes = True
                    elif match.group(1) == "NAYS" and seen_yes:
                        vtype = "no"
                    elif match.group(1) == "VACANT":
                        continue  # skip these
                    elif seen_yes:
                        vtype = "other"
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    for name in line.split("   "):
                        if not name:
                            continue
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts["yes"] > (counts["no"] + counts["other"])

            vote = Vote(
                chamber=chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])
            vote.pupa_id = url + "#" + rcs

            vote.add_source(url)

            for name in votes["yes"]:
                vote.yes(name)
            for name in votes["no"]:
                if ":" in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes["other"]:
                vote.vote("other", name)

            yield vote
Ejemplo n.º 36
0
    def parse_vote_page(self, vote_url, bill):
        vote_html = self.get(vote_url).text
        doc = lxml.html.fromstring(vote_html)
        # chamber
        if 'senate' in vote_url:
            chamber = 'upper'
        else:
            chamber = 'lower'

        # date in the following format: Mar 23, 2009
        date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
        date = date.replace(u'\xa0', ' ')
        date = datetime.datetime.strptime(date[18:], '%b %d, %Y')

        # motion
        motion = ''.join(x.text_content()
                         for x in doc.xpath('//td[@colspan="23"]'))
        if motion == '':
            motion = "No motion given"  # XXX: Double check this. See SJ 3.
        motion = motion.replace(u'\xa0', ' ')

        # totals
        tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get('class')
        totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
        yes_count = int(totals[0].split()[-1])
        no_count = int(totals[1].split()[-1])
        other_count = int(totals[2].split()[-1])
        other_count += int(totals[3].split()[-1])
        other_count += int(totals[4].split()[-1])
        passed = yes_count > no_count

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )
        vote.pupa_id = vote_url  # contains sequence number
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)

        # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
        func = None
        for td in doc.xpath('//td/text()'):
            td = td.replace(u'\xa0', ' ')
            if td.startswith('Voting Yea'):
                func = vote.yes
            elif td.startswith('Voting Nay'):
                func = vote.no
            elif td.startswith('Not Voting'):
                func = vote.other
            elif td.startswith('Excused'):
                func = vote.other
            elif func:
                td = td.rstrip('*')
                func(td)

        return vote
Ejemplo n.º 37
0
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h3[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        elif location.startswith('Joint'):
            chamber = 'legislature'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        motion = ', '.join(header.split(', ')[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//span[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//span[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//span[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                type = 'passage'
            elif motion == 'Concurred in amendments':
                type = 'amendment'
            elif motion == 'Veto override':
                type = 'veto_override'
            else:
                type = 'other'

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification=type,
                             bill=bill)
            # The vote page URL has a unique ID
            # However, some votes are "consent calendar" events,
            # and relate to the passage of _multiple_ bills
            # These can't be modeled yet in Pupa, but for now we can
            # append a bill ID to the URL that forms the `pupa_id`
            # https://github.com/opencivicdata/pupa/issues/308
            vote.pupa_id = '{}#{}'.format(url,
                                          bill.identifier.replace(' ', ''))

            vote.add_source(url)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', excused_count)
            vote.set_count('absent', absent_count)

            for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
                option_or_person = td.text.strip()
                if option_or_person in ('Aye', 'Yea'):
                    vote.yes(td.getprevious().text.strip())
                elif option_or_person == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif option_or_person == 'Excused':
                    vote.vote('excused', td.getprevious().text.strip())
                elif option_or_person == 'Absent':
                    vote.vote('absent', td.getprevious().text.strip())

            yield vote
Ejemplo n.º 38
0
    def _build_lower_votes(self):
        url = self.shared_url + '&Floor%26nbspVotes=Y'
        self.urls.add(votes=url)
        self.bill.add_source(url)
        doc = self.urls.votes.doc
        if doc is None:
            return

        # Grab bill information.
        try:
            pre = doc.xpath('//pre')[0].text_content().strip()

            no_votes = 'There are no votes for this bill in this legislative '

            if pre == no_votes:
                raise ValueError('No votes for this bill.')
        # Skip bill if votes can't be found.
        except (IndexError, ValueError):
            return

        for table in doc.xpath('//table'):

            date = table.xpath('caption/span[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            date = date.replace(tzinfo=timezone('UTC'))

            spanText = table.xpath('caption/span/text()')
            motion = spanText[2].strip()+spanText[3].strip()

            votes = table.xpath('caption/span/span')[0].text.split(':')[1].split('/')
            yes_count, no_count = map(int, votes)
            passed = yes_count > no_count
            vote = VoteEvent(
                chamber='lower',
                start_date=date,
                motion_text=motion,
                bill=self.bill,
                result='pass' if passed else 'fail',
                classification='passage'
            )

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            absent_count = 0
            excused_count = 0
            tds = table.xpath('tr/td/text()')
            votes = [tds[i:i+2] for i in range(0, len(tds), 2)]

            vote_dictionary = {
                'Y': 'yes',
                'NO': 'no',
                'ER': 'excused',
                'AB': 'absent',
                'NV': 'not voting'
            }

            for vote_pair in votes:
                name, vote_val = vote_pair
                vote.vote(vote_dictionary[vote_val], name)
                if vote_val == 'AB':
                    absent_count += 1
                elif vote_val == 'ER':
                    excused_count += 1

            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            vote.add_source(url)
            vote.pupa_id = url + motion + spanText[1]

            yield vote
Ejemplo n.º 39
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower') or
                    (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id, source_url, media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime(
                    '%m/%d/%y')
                version_name = "{} - {}".format(
                    version_date_human, version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type='application/pdf',
                    date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    def replacer(matchobj):
                        if matchobj:
                            return {'Assembly': 'lower',
                                    'Senate': 'upper'}[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                                           classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(
                        committee, entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = (
                    'http://leginfo.legislature.ca.gov/faces'
                    '/billVotesClient.xhtml?bill_id={}'
                ).format(fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Ejemplo n.º 40
0
    def parse_vote(self, bill, link):
        # Server sometimes sends proper error headers,
        # sometimes not
        try:
            self.info("Get {}".format(link))
            text = requests.get(link).text
        except requests.exceptions.HTTPError as err:
            self.warning("{} fetching vote {}, skipping".format(err, link))
            return

        if 'Varnish cache server' in text:
            self.warning("Scrape rate is too high, try re-scraping with "
                         "The --rpm set to a lower number")
            return

        if 'Page Not Found' in text or 'Page Unavailable' in text:
            self.warning("missing vote, skipping")
            return
        member_doc = lxml.html.fromstring(text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        chamber_date_line = ''.join(member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
        chamber_date_line_words = chamber_date_line.split()
        vote_chamber = chamber_date_line_words[0]
        vote_date = datetime.datetime.strptime(chamber_date_line_words[-1], '%m/%d/%Y')
        vote_status = " ".join(chamber_date_line_words[2:-2])
        opinions = member_doc.xpath("//div[@id='main_content']/h3[position() > 1]/text()")
        if len(opinions) > 0:
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 41
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if ('AM' in motion or 'PM' in motion) and '/' in motion:
            motion = "Motion not given."

        if 'withdrawn' not in motion:
            yes_no_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                       "font[contains(text(),'Aye')]]/font/text()")
            other_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                      "font[contains(text(),'Absent')]]/font/text()")
            abstain_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                        "font[contains(text(),'17C')]]/font/text()")
            yes_count = int(yes_no_counts[0])
            no_count = int(yes_no_counts[2])
            exc_count = int(other_counts[2])
            absent_count = int(other_counts[0])
            abstain_count = 0
            if abstain_counts:
                abstain_count = int(abstain_counts[0])

            # fix for
            # http://leg.colorado.gov/content/hb19-1029vote65e72e
            if absent_count == -1:
                absent_count = 0

            passed = yes_count > no_count
            vote = VoteEvent(chamber=chamber,
                             start_date=self._tz.localize(date),
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             bill=bill,
                             classification='passage',
                             )
            vote.pupa_id = vote_url
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', exc_count)
            vote.set_count('absent', absent_count)
            vote.set_count('abstain', abstain_count)
            vote.add_source(vote_url)

            rolls = page.xpath("//tr[preceding-sibling::tr/descendant::"
                               "td/div/b/font[contains(text(),'Vote')]]")

            vote_abrv = {'Y': 'yes', 'N': 'no', 'E': 'excused', 'A': 'absent',
                         '-': 'absent', '17C': 'abstain'}
            for roll in rolls:
                voted = roll.xpath(".//td/div/font/text()")[0].strip()
                voter = roll.xpath(".//td/font/text()")[0].strip()
                if voted == 'V':
                    continue
                vote.vote(vote_abrv[voted], voter)
            yield vote
Ejemplo n.º 42
0
    def scrape_votes(self, bill_page, page_url, bill, insert, year):
        root = lxml.html.fromstring(bill_page)
        trs = root.xpath('/html/body/div/table[6]//tr')
        assert len(trs) >= 1, "Didn't find the Final Passage Votes' table"

        for tr in trs[1:]:
            links = tr.xpath('td/a[contains(text(), "Passage")]')
            if len(links) == 0:
                self.warning("Non-passage vote found for {}; ".format(bill.identifier) +
                             "probably a motion for the calendar. It will be skipped.")
            else:
                assert len(links) == 1, \
                    "Too many votes found for XPath query, on bill {}".format(bill.identifier)
                link = links[0]

            motion = link.text
            if 'Assembly' in motion:
                chamber = 'lower'
            else:
                chamber = 'upper'

            votes = {}
            tds = tr.xpath('td')
            for td in tds:
                if td.text:
                    text = td.text.strip()
                    date = re.match('... .*?, ....', text)
                    count = re.match('(?P<category>.*?) (?P<votes>[0-9]+)[,]?', text)
                    if date:
                        vote_date = datetime.strptime(text, '%b %d, %Y')
                    elif count:
                        votes[count.group('category')] = int(count.group('votes'))

            yes = votes['Yea']
            no = votes['Nay']
            excused = votes['Excused']
            not_voting = votes['Not Voting']
            absent = votes['Absent']
            other = excused + not_voting + absent
            passed = yes > no

            vote = VoteEvent(chamber=chamber, start_date=self._tz.localize(vote_date),
                             motion_text=motion, result='pass' if passed else 'fail',
                             classification='passage', bill=bill,
                             )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)
            vote.set_count('not voting', not_voting)
            vote.set_count('absent', absent)
            # try to get vote details
            try:
                vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link.get('href'))
                vote.pupa_id = vote_url
                vote.add_source(vote_url)

                if vote_url in self._seen_votes:
                    self.warning('%s is included twice, skipping second', vote_url)
                    continue
                else:
                    self._seen_votes.add(vote_url)

                page = self.get(vote_url).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                for el in root.xpath('//table[2]/tr'):
                    tds = el.xpath('td')
                    name = tds[1].text_content().strip()
                    vote_result = tds[2].text_content().strip()

                    if vote_result == 'Yea':
                        vote.yes(name)
                    elif vote_result == 'Nay':
                        vote.no(name)
                    else:
                        vote.vote('other', name)
                vote.add_source(page_url)
            except scrapelib.HTTPError:
                self.warning("failed to fetch vote page, adding vote without details")

            yield vote
Ejemplo n.º 43
0
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        elif location.startswith('Joint'):
            chamber = 'legislature'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        # committee = ' '.join(location.split(' ')[1:]).strip()
        # if not committee or committee.startswith('of Representatives'):
        #     committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//td[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                type = 'passage'
            elif motion == 'Concurred in amendments':
                type = 'amendment'
            elif motion == 'Veto override':
                type = 'veto_override'
            else:
                type = 'other'

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification=type,
                             bill=bill
                             )
            vote.pupa_id = url      # vote id is in URL

            vote.add_source(url)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', excused_count)
            vote.set_count('absent', absent_count)

            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text in ('Aye', 'Yea'):
                    vote.yes(td.getprevious().text.strip())
                elif td.text == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif td.text == 'Excused':
                    vote.vote('excused', td.getprevious().text.strip())
                elif td.text == 'Absent':
                    vote.vote('absent', td.getprevious().text.strip())

            yield vote
Ejemplo n.º 44
0
    def scrape_votes(self, session):
        votes = {}
        other_counts = defaultdict(int)
        last_line = []
        vote_url = 'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
        lines = self.get(vote_url).content.decode('utf-8').splitlines()

        for line in lines:

            if len(line) < 2:
                continue

            if line.strip() == "":
                continue

            line = line.split('|')
            if len(line) < 14:
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning('used bad vote line')
                else:
                    last_line = line
                    self.warning('bad vote line %s' % '|'.join(line))
            session_yr = line[0].replace('\xef\xbb\xbf', '')
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or '[not available]'

            if session_yr == session and bill_id in self.bills_by_id:
                actor = 'lower' if body == 'H' else 'upper'
                time = dt.datetime.strptime(timestamp,
                                            '%m/%d/%Y %I:%M:%S %p')
                time = pytz.timezone('America/New_York').localize(time).isoformat()
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(chamber=actor,
                            start_date=time,
                            motion_text=motion,
                            result='pass' if passed else 'fail',
                            classification='passage',
                            bill=self.bills_by_id[bill_id])
                vote.set_count('yes', yeas)
                vote.set_count('no', nays)
                vote.add_source(vote_url)
                vote.pupa_id = session_yr + body + vote_num     # unique ID for vote
                votes[body+vote_num] = vote

        for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt') \
                        .content.decode('utf-8').splitlines():
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, vote, date = \
                line.split('|')

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = " ".join(self.legislators[employee]['name'].split())
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                if body+v_num not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % (body+v_num))
                    continue
                # code = self.legislators[employee]['seat']

                if vote == 'Yea':
                    votes[body+v_num].yes(leg)
                elif vote == 'Nay':
                    votes[body+v_num].no(leg)
                else:
                    votes[body+v_num].vote('other', leg)
                    # hack-ish, but will keep the vote count sync'd
                    other_counts[body+v_num] += 1
                    votes[body+v_num].set_count('other', other_counts[body+v_num])
        for vid, vote in votes.items():
            yield vote
Ejemplo n.º 45
0
    def _parse_votes(self, url, vote, bill):
        '''Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except IndexError:
            # Some of them mysteriously have no motion listed.
            motion = vote['action']

        if not motion:
            motion = vote['action']

        vote['motion'] = motion

        action = vote['action']
        vote_url = vote['vote_url']

        vote = VoteEvent(
            chamber=vote['chamber'],
            start_date=vote['date'],
            motion_text=vote['motion'],
            result='fail',      # placeholder
            classification='passage',
            bill=bill,
            bill_action=vote['action'],
        )
        vote.pupa_id = vote_url         # URL contains sequence number
        vote.add_source(vote_url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            v, name = filter(None, text.split(u'\xa0'))
            # Considering Name is brackets as short name
            regex = re.compile(r".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = 'Short Name: ' + short_name[0]
            else:
                note = ''
            # Name without brackets like 'Kary, Douglas'
            name = re.sub(r"[\(\[].*?[\)\]]", "", name)
            if v == 'Y':
                vote.yes(name, note=note)
            elif v == 'N':
                vote.no(name, note=note)
            elif v == 'E':
                vote.vote('excused', name, note=note)
            elif v == 'A':
                vote.vote('absent', name, note=note)

        # code to deterimine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = 'pass' if passed else 'fail'

        return vote
Ejemplo n.º 46
0
    def scrape_vote(self, bill, vote_id, session):
        vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
        form = {
            'rollCallId': vote_id,
            'sort': '',
            'group': '',
            'filter': '',
        }

        self.info('Fetching vote {} for {}'.format(vote_id, bill.identifier))
        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
        if page:
            roll = page['Model']
            vote_chamber = self.chamber_map[roll['ChamberName']]
            # "7/1/16 01:00 AM"
            vote_date = dt.datetime.strptime(roll['TakenAtDateTime'],
                                             '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')

            # TODO: What does this code mean?
            vote_motion = roll['RollCallVoteType']

            vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
            other_count = (int(roll['NotVotingCount']) +
                           int(roll['VacantVoteCount']) +
                           int(roll['AbsentVoteCount']) +
                           int(roll['ConflictVoteCount'])
                           )
            vote = VoteEvent(chamber=vote_chamber,
                             start_date=vote_date,
                             motion_text=vote_motion,
                             result=vote_passed,
                             classification='other',
                             bill=bill,
                             legislative_session=session
                             )
            vote_pdf_url = 'https://legis.delaware.gov' \
                '/json/RollCallController/GenerateRollCallPdf' \
                '?rollCallId={}&chamberId={}'.format(vote_id, self.chamber_codes[vote_chamber])
            # Vote URL is just a generic search URL with POSTed data,
            # so provide a different link
            vote.add_source(vote_pdf_url)
            vote.pupa_id = vote_pdf_url
            vote.set_count('yes', roll['YesVoteCount'])
            vote.set_count('no', roll['NoVoteCount'])
            vote.set_count('other', other_count)

            for row in roll['AssemblyMemberVotes']:
                # AssemblyMemberId looks like it should work here,
                # but for some sessions it's bugged to only return session
                try:
                    voter = self.legislators_by_short[str(row['ShortName'])]
                    name = voter['DisplayName']
                except KeyError:
                    self.warning('could not find legislator short name %s',
                                 row['ShortName'])
                    name = row['ShortName']
                if row['SelectVoteTypeCode'] == 'Y':
                    vote.yes(name)
                elif row['SelectVoteTypeCode'] == 'N':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            yield vote
Ejemplo n.º 47
0
    def scrape_vote_history(self, bill, vurl):
        """
         Obtain the information on a vote and link it to the related Bill
        :param bill: related bill
        :param vurl: source for the voteEvent information.
        :return: voteEvent object
        """
        html = self.get(vurl).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(vurl)

        # skip first two rows
        for row in doc.xpath('//table/tr')[2:]:
            tds = row.getchildren()
            if len(tds) != 11:
                self.warning('irregular vote row: %s' % vurl)
                continue
            timestamp, motion, vote, yeas, nays, nv, exc, pres, abst, total, result = tds

            timestamp = timestamp.text.replace(u'\xa0', ' ')
            timestamp = datetime.datetime.strptime(timestamp,
                                                   '%m/%d/%Y %H:%M %p')

            yeas = int(yeas.text)
            nays = int(nays.text)
            others = int(nv.text) + int(exc.text) + \
                int(abst.text) + int(pres.text)
            assert yeas + nays + others == int(total.text)

            if result.text == 'Passed':
                passed = 'pass'
            else:
                passed = 'fail'

            vote_link = vote.xpath('a')[0]
            if '[H]' in vote_link.text:
                chamber = 'lower'
            else:
                chamber = 'upper'

            vote = VoteEvent(
                chamber=chamber,  # 'upper' or 'lower'
                start_date=timestamp.strftime(
                    '%Y-%m-%d'),  # 'YYYY-MM-DD' format
                motion_text=motion.text,
                result=passed,
                classification='passage',  # Can also be 'other'

                # Provide a Bill instance to link with the VoteEvent...
                bill=bill,
            )

            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.set_count('other', others)

            vote.add_source(vurl)

            # obtain vote rollcall from pdf and add it to the VoteEvent object
            rollcall_pdf = vote_link.get('href')
            self.scrape_rollcall(vote, rollcall_pdf)
            vote.add_source(rollcall_pdf)
            if rollcall_pdf in self._seen_vote_ids:
                self.warning('duplicate usage of %s, skipping', rollcall_pdf)
                continue
            else:
                self._seen_vote_ids.add(rollcall_pdf)
            vote.pupa_id = rollcall_pdf     # distinct KEY for each one

            yield vote
Ejemplo n.º 48
0
    def parse_vote(self, bill, action, act_chamber, act_date, url):
        re_vote_text = re.compile(r'The question (?:being|to be reconsidered):\s*"(.*?\?)"', re.S)
        re_header = re.compile(r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        if len(doc.xpath('//pre')) < 2:
            return

        # Find all chunks of text representing voting reports.
        votes_text = doc.xpath('//pre')[1].text_content()
        votes_text = re_vote_text.split(votes_text)
        votes_data = zip(votes_text[1::2], votes_text[2::2])

        iVoteOnPage = 0

        # Process each.
        for motion, text in votes_data:

            iVoteOnPage += 1
            yes = no = other = 0

            tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
            for vtype, vcount in tally:
                vcount = int(vcount) if vcount != '-' else 0
                if vtype == 'Y':
                    yes = vcount
                elif vtype == 'N':
                    no = vcount
                else:
                    other += vcount

            vote = VoteEvent(
                bill=bill,
                start_date=act_date.strftime('%Y-%m-%d'),
                chamber=act_chamber,
                motion_text=motion,
                result='pass' if yes > no else 'fail',
                classification='passage',
            )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)

            vote.pupa_id = (url + ' ' + str(iVoteOnPage)) if iVoteOnPage > 1 else url

            # In lengthy documents, the "header" can be repeated in the middle
            # of content. This regex gets rid of it.
            vote_lines = re_header.sub('', text)
            vote_lines = vote_lines.split('\r\n')

            vote_type = None
            for vote_list in vote_lines:
                if vote_list.startswith('Yeas: '):
                    vote_list, vote_type = vote_list[6:], 'yes'
                elif vote_list.startswith('Nays: '):
                    vote_list, vote_type = vote_list[6:], 'no'
                elif vote_list.startswith('Excused: '):
                    vote_list, vote_type = vote_list[9:], 'other'
                elif vote_list.startswith('Absent: '):
                    vote_list, vote_type = vote_list[9:], 'other'
                elif vote_list.strip() == '':
                    vote_type = None
                if vote_type:
                    for name in vote_list.split(','):
                        name = name.strip()
                        if name:
                            vote.vote(vote_type, name)

            vote.add_source(url)

            yield vote
Ejemplo n.º 49
0
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h3[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        elif location.startswith('Joint'):
            chamber = 'legislature'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        motion = ', '.join(header.split(', ')[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//span[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//span[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//span[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                type = 'passage'
            elif motion == 'Concurred in amendments':
                type = 'amendment'
            elif motion == 'Veto override':
                type = 'veto_override'
            else:
                type = 'other'

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification=type,
                             bill=bill
                             )
            # The vote page URL has a unique ID
            # However, some votes are "consent calendar" events,
            # and relate to the passage of _multiple_ bills
            # These can't be modeled yet in Pupa, but for now we can
            # append a bill ID to the URL that forms the `pupa_id`
            # https://github.com/opencivicdata/pupa/issues/308
            vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))

            vote.add_source(url)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', excused_count)
            vote.set_count('absent', absent_count)

            for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
                option_or_person = td.text.strip()
                if option_or_person in ('Aye', 'Yea'):
                    vote.yes(td.getprevious().text.strip())
                elif option_or_person == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif option_or_person == 'Excused':
                    vote.vote('excused', td.getprevious().text.strip())
                elif option_or_person == 'Absent':
                    vote.vote('absent', td.getprevious().text.strip())

            yield vote
Ejemplo n.º 50
0
    def parse_vote_pdf(self, vote_url, bill):

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type='text').decode()
        lines = text.splitlines()

        if 'Senate' in vote_url:
            chamber = 'upper'
        else:
            chamber = 'lower'

        date_string = lines[0].split('Calendar Date:')[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        page_index = None
        for index, line in enumerate(lines):
            if 'Yeas' in line and 'Nays' in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

        if page_index:

            counts = re.split(r'\s{2,}', lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(' ', 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ['Consent Calendar' in line for line in lines[:page_index]])
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r'\s{2,}',
                                              lines[page_index - 1].strip())
            assert consent_calendar_bills, "Could not find bills for consent calendar vote"

        motion_keywords = [
            'favorable', 'reading', 'amendment', 'motion', 'introduced',
            'bill pass', 'committee'
        ]
        motion_lines = [
            3, 2, 4, 5
        ]  # Relative LineNumbers to be checked for existence of motion

        for i in motion_lines:
            if any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
                break
            motion = re.split(r'\s{2,}', lines[page_index - i].strip())[0]
        else:
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            'Voting Nay', 'Not Voting', 'COPY', 'Excused',
            'indicates vote change'
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            if not current_line or 'Voting Yea' in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line
                   for show_stopper in show_stoppers):
                page_index += 1
                vote_index = (vote_index + 1)
                continue

            names = re.split(r'\s{2,}', current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Ejemplo n.º 51
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath('//tbody/tr')
        for row in action_rows:
            action_date = row.xpath('td[1]/text()')[0]
            action_date = datetime.strptime(action_date, '%m/%d/%Y')
            action_year = action_date.year
            action_date = action_date.strftime('%Y-%m-%d')

            if row.xpath('td[2]/text()'):
                action_actor = row.xpath('td[2]/text()')[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath('string(td[3])')

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = re.findall(r'(.+)-\s*\d+\s*YEAS', action_name)[0].strip()

                y = int(re.findall(r'(\d+)\s*YEAS', action_name)[0])
                n = int(re.findall(r'(\d+)\s*NAYS', action_name)[0])

                # get supplement number
                n_supplement = int(re.findall(r'No\.\s*(\d+)', action_name)[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                housevote_pdf = 'https://malegislature.gov/Journal/House/{}/{}/RollCalls'.format(
                    bill.legislative_session, action_year)
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

                # XXX: disabled house votes on 8/1 to try to get MA importing again
                # will leaving this in and commented out once we resolve the ID issue
                # yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(' -')[0]
                try:
                    y, n = re.search(r'(\d+) yeas .*? (\d+) nays', action_name.lower()).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    y = int(re.search(r"yeas\s+(\d+)", action_name.lower()).group(1))
                    n = int(re.search(r"nays\s+(\d+)", action_name.lower()).group(1))

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                cached_vote.pupa_id = rollcall_pdf
                # XXX: also disabled, see above note
                # yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs['classification'],
            )
            for com in attrs.get('committees', []):
                action.add_related_entity(com, entity_type='organization')
Ejemplo n.º 52
0
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower')
                    or (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime('%m/%d/%y')
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(version_name,
                                        version_url_pdf,
                                        media_type='application/pdf',
                                        date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                'Assembly': 'lower',
                                'Senate': 'upper'
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on',
                             action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime('%Y-%m-%d'),
                    chamber=actor,
                    classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(committee,
                                              entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = ('http://leginfo.legislature.ca.gov/faces'
                              '/billVotesClient.xhtml?bill_id={}').format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Ejemplo n.º 53
0
    def parse_roll_call(self, bill, link, chamber, date):
        url = link.attrib['href']
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
        motion = page.xpath(xpath).strip()
        motion = re.sub(r'\s+', ' ', motion)

        if motion == 'FP':
            motion = 'FINAL PASSAGE'

        if motion == 'FINAL PASSAGE':
            type = 'passage'
        elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
            type = 'amendment'
        else:
            type = 'other'
            motion = link.text_content()

        yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
        nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
        lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
        nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
        other = lve + nv

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=type,
            result='pass' if yeas > (nays + other) else 'fail',
            bill=bill,
        )
        # pupa_id situation here is a bit weird, same vote can be used for
        # multiple bills see:
        # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11       # noqa
        # so we toss the bill id onto the end of the URL
        vote.pupa_id = url + '#' + bill.identifier
        vote.add_source(url)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('other', other)

        for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = div.text_content().strip()
            name = re.sub(r'^[\s,]+', '', name)
            name = re.sub(r'[\s,]+$', '', name)
            class_attr = div.attrib['class'].lower()
            if 'yea' in class_attr:
                voteval = 'yes'
            elif 'nay' in class_attr:
                voteval = 'no'
            elif 'nvote' in class_attr:
                voteval = 'other'
            elif 'lve' in class_attr:
                voteval = 'other'
            else:
                msg = 'Unrecognized vote val: %s' % class_attr
                raise Exception(msg)
            vote.vote(voteval, name)

        return vote
Ejemplo n.º 54
0
    def scrape_pdf_for_votes(self, session, actor, date, motion, href):
        warned = False
        # vote indicator, a few spaces, a name, newline or multiple spaces
        # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
        COUNT_RE = re.compile(
            r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$'
        )
        PASS_FAIL_WORDS = {
            'PASSED': 'pass',
            'PREVAILED': 'fail',
            'ADOPTED': 'pass',
            'CONCURRED': 'pass',
            'FAILED': 'fail',
            'LOST': 'fail',
        }

        pdflines = self.fetch_pdf_lines(href)

        if not pdflines:
            return False

        yes_count = no_count = present_count = 0
        yes_votes = []
        no_votes = []
        present_votes = []
        excused_votes = []
        not_voting = []
        absent_votes = []
        passed = None
        counts_found = False
        vote_lines = []
        for line in pdflines:
            # consider pass/fail as a document property instead of a result of the vote count
            # extract the vote count from the document instead of just using counts of names
            if not line.strip():
                continue
            elif line.strip() in PASS_FAIL_WORDS:
                # Crash on duplicate pass/fail status that differs from previous status
                if passed is not None and passed != PASS_FAIL_WORDS[line.strip()]:
                    raise Exception("Duplicate pass/fail matches in [%s]" % href)
                passed = PASS_FAIL_WORDS[line.strip()]
            elif COUNT_RE.match(line):
                (yes_count, no_count, present_count,
                 not_voting_count) = COUNT_RE.match(line).groups()
                yes_count = int(yes_count)
                no_count = int(no_count)
                present_count = int(present_count)
                counts_found = True
            elif counts_found:
                for value in VOTE_VALUES:
                    if re.search(r'^\s*({})\s+\w'.format(value), line):
                        vote_lines.append(line)
                        break

        votes = find_columns_and_parse(vote_lines)
        for name, vcode in votes.items():
            if name == 'Mr. Speaker':
                name = session_details[session]['speaker']
            elif name == 'Mr. President':
                name = session_details[session]['president']
            else:
                # Converts "Davis,William" to "Davis, William".
                name = re.sub(r'\,([a-zA-Z])', r', \1', name)

            if vcode == 'Y':
                yes_votes.append(name)
            elif vcode == 'N':
                no_votes.append(name)
            elif vcode == 'P':
                present_votes.append(name)
            elif vcode == 'E':
                excused_votes.append(name)
            elif vcode == 'NV':
                not_voting.append(name)
            elif vcode == 'A':
                absent_votes.append(name)

        # fake the counts
        if yes_count == 0 and no_count == 0 and present_count == 0:
            yes_count = len(yes_votes)
            no_count = len(no_votes)
        else:  # audit
            if yes_count != len(yes_votes):
                self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                             (yes_count, len(yes_votes)))
                warned = True
            if no_count != len(no_votes):
                self.warning("Mismatched no count [expect: %i] [have: %i]" %
                             (no_count, len(no_votes)))
                warned = True

        if passed is None:
            if actor['classification'] == 'lower':  # senate doesn't have these lines
                self.warning("No pass/fail word found; fall back to comparing yes and no vote.")
                warned = True
            passed = 'pass' if yes_count > no_count else 'fail'

        classification, _ = _categorize_action(motion)
        vote_event = VoteEvent(legislative_session=session,
                               motion_text=motion,
                               classification=classification,
                               organization=actor,
                               start_date=date,
                               result=passed)
        for name in yes_votes:
            vote_event.yes(name)
        for name in no_votes:
            vote_event.no(name)
        for name in present_votes:
            vote_event.vote('other', name)
        for name in excused_votes:
            vote_event.vote('excused', name)
        for name in not_voting:
            vote_event.vote('not voting', name)
        for name in absent_votes:
            vote_event.vote('absent', name)

        vote_event.set_count('yes', yes_count)
        vote_event.set_count('no', no_count)
        vote_event.set_count('other', present_count)
        vote_event.set_count('excused', len(excused_votes))
        vote_event.set_count('absent', len(absent_votes))
        vote_event.set_count('not voting', len(not_voting))

        vote_event.add_source(href)

        # for distinguishing between votes with the same id and on same day
        vote_event.pupa_id = href

        if warned:
            self.warning("Warnings were issued. Best to check %s" % href)
        return vote_event
Ejemplo n.º 55
0
    def scrape_votes(self, bill, url):
        page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={'re': re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if 'HOUSE' in header.xpath("string()"):
                chamber = 'lower'
                motion_index = 8
            else:
                chamber = 'upper'
                motion_index = 13

            motion = header.xpath(
                "string(following-sibling::p[%d])" % motion_index).strip()
            motion = re.sub(r'\s+', ' ', motion)
            if not motion.strip():
                self.warning("Motion text not found")
                return
            match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == 'PASSED'
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
            rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r'\d+/\d+/\d+', date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace('\r\n', ' ').strip()
                if "*****" in line:
                    break
                regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
                         'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')
                match = re.match(regex, line)
                if match:
                    if match.group(1) == 'YEAS' and 'RCS#' not in line:
                        vtype = 'yes'
                        seen_yes = True
                    elif match.group(1) == 'NAYS' and seen_yes:
                        vtype = 'no'
                    elif match.group(1) == 'VACANT':
                        continue  # skip these
                    elif seen_yes:
                        vtype = 'other'
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    for name in line.split('   '):
                        if not name:
                            continue
                        if 'HOUSE' in name or 'SENATE ' in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote(chamber=chamber,
                        start_date=date.strftime('%Y-%m-%d'),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        bill=bill,
                        classification='passage')
            vote.set_count('yes', counts['yes'])
            vote.set_count('no', counts['no'])
            vote.set_count('other', counts['other'])
            vote.pupa_id = url + '#' + rcs

            vote.add_source(url)

            for name in votes['yes']:
                vote.yes(name)
            for name in votes['no']:
                if ':' in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes['other']:
                vote.vote('other', name)

            yield vote
Ejemplo n.º 56
0
    def parse_vote(self, bill, link):
        # Server sometimes sends proper error headers,
        # sometimes not
        try:
            self.info("Get {}".format(link))
            text = requests.get(link).text
        except requests.exceptions.HTTPError as err:
            self.warning("{} fetching vote {}, skipping".format(err, link))
            return

        if "Varnish cache server" in text:
            self.warning("Scrape rate is too high, try re-scraping with "
                         "The --rpm set to a lower number")
            return

        if "Page Not Found" in text or "Page Unavailable" in text:
            self.warning("missing vote, skipping")
            return
        member_doc = lxml.html.fromstring(text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        chamber_date_line = "".join(
            member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
        chamber_date_line_words = chamber_date_line.split()
        vote_chamber = chamber_date_line_words[0]
        vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                               "%m/%d/%Y")
        vote_status = " ".join(chamber_date_line_words[2:-2])
        opinions = member_doc.xpath(
            "//div[@id='main_content']/h3[position() > 1]/text()")
        if len(opinions) > 0:
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime("%Y-%m-%d"),
                chamber=vote_chamber,
                motion_text=vote_status,
                result="pass" if yes_count > no_count else "fail",
                classification="passage",
            )
            vote.pupa_id = link

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("abstain", p_count)
            vote.set_count("absent", a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote("yes", re.sub(",", "", a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote("no", re.sub(",", "", a_links[i]).split()[0])
                else:
                    vote.vote("other", re.sub(",", "", a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Ejemplo n.º 57
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath('//tbody/tr')
        for row in action_rows:
            action_date = row.xpath('td[1]/text()')[0]
            action_date = datetime.strptime(action_date, '%m/%d/%Y')
            action_year = action_date.year
            action_date = action_date.strftime('%Y-%m-%d')

            if row.xpath('td[2]/text()'):
                action_actor = row.xpath('td[2]/text()')[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath('string(td[3])')

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = action_name.split(' -')[0]
                y = int(action_name.strip().split('-')[1].split('YEAS')[0])
                n = int(action_name.strip().split('YEAS to')[1].split('NAYS')[0])

                # get supplement number
                n_supplement = int(action_name.strip().split('No. ')[1].split(r')')[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                housevote_pdf = 'http://www.mass.gov/legis/journal/combined{}RCs.pdf'.format(
                    action_year
                )
                # note: 2014-2015 different format and no data on website for years prior to 2014
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

                yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(' -')[0]
                try:
                    y, n = re.search('(\d+) yeas .*? (\d+) nays', action_name.lower()).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    y = int(re.search(r"yeas\s*(\d*)", action_name.lower()).group(1))
                    n = int(re.search(r"nays\s*(\d*)", action_name.lower()).group(1))

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs['classification'],
            )
            for com in attrs.get('committees', []):
                action.add_related_entity(com, entity_type='organization')
Ejemplo n.º 58
0
    def parse_vote_pdf(self, vote_url, bill):

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        if "Senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        page_index = None
        for index, line in enumerate(lines):
            if "Yeas" in line and "Nays" in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ["yes", "no", "not voting", "excused", "absent"]

        if page_index:

            counts = re.split(r"\s{2,}", lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(" ", 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ["Consent Calendar" in line for line in lines[:page_index]]
        )
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]
        motion_lines = [
            3,
            2,
            4,
            5,
        ]  # Relative LineNumbers to be checked for existence of motion

        for i in motion_lines:
            if any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                break
            motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
        else:
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            if not current_line or "Voting Yea" in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line for show_stopper in show_stoppers):
                page_index += 1
                vote_index = vote_index + 1
                continue

            names = re.split(r"\s{2,}", current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Ejemplo n.º 59
0
    def scrape_vote(self, bill, action_text, url):
        doc = lxml.html.fromstring(self.get(url).text)

        # process action_text - might look like "Vote - Senate Floor -
        # Third Reading Passed (46-0) - 01/16/12"
        if action_text.startswith('Vote - Senate Floor - '):
            action_text = action_text[22:]
            chamber = 'upper'
        elif action_text.startswith('Vote - House Floor - '):
            action_text = action_text[21:]
            chamber = 'lower'

        motion, unused_date = action_text.rsplit(' - ', 1)
        try:
            yes_count, no_count = re.findall('\((\d+)-(\d+)\)', motion)[0]
            yes_count = int(yes_count)
            no_count = int(no_count)
        except IndexError:
            self.info("Motion text didn't contain vote totals, will get them from elsewhere")
            yes_count = None
            no_count = None

        if 'Passed' in motion:
            motion = motion.split(' Passed')[0]
            passed = True
        elif 'Adopted' in motion:
            motion = motion.split(' Adopted')[0]
            passed = True
        elif 'Rejected' in motion:
            motion = motion.split(' Rejected')[0]
            passed = False
        elif 'Failed' in motion:
            motion = motion.split(' Failed')[0]
            passed = False
        elif 'Concur' in motion:
            passed = True
        elif 'Floor Amendment' in motion:
            if yes_count and no_count:
                passed = yes_count > no_count
            else:
                passed = None
        elif 'overridden' in motion.lower():
            passed = True
            motion = 'Veto Override'
        elif 'Sustained' in motion:
            passed = False
            motion = 'Veto Override'
        else:
            raise Exception('unknown motion: %s' % motion)
        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=None,
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vfunc = None

        nobrs = doc.xpath('//nobr/text()')
        for text in nobrs:
            text = text.replace(u'\xa0', ' ')
            if text.startswith('Calendar Date: '):
                if vote.start_date:
                    self.warning('two dates!, skipping rest of bill')
                    break
                vote.start_date = datetime.datetime.strptime(
                    text.split(': ', 1)[1], '%b %d, %Y %H:%M %p'
                ).strftime('%Y-%m-%d')
            elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
                yeas, nays, nv, exc, absent = re.match(
                    (
                        '(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused '
                        '\(Absent\)\s+(\d+) Absent'
                    ), text).groups()
                vote.set_count('yes', int(yeas))
                vote.set_count('no', int(nays))
                vote.set_count('other', int(nv) + int(exc) + int(absent))
            elif 'Voting Yea' in text:
                vfunc = 'yes'
            elif 'Voting Nay' in text:
                vfunc = 'no'
            elif 'Not Voting' in text or 'Excused' in text:
                vfunc = 'other'
            elif vfunc:
                if ' and ' in text:
                    legs = text.split(' and ')
                else:
                    legs = [text]
                for leg in legs:
                    # Strip the occasional asterisk - see #1512
                    leg = leg.rstrip('*')
                    vote.vote(vfunc, leg)

        vote.add_source(url)
        vote.pupa_id = url      # contains vote sequence number
        yield vote