Example #1
0
    def scrape_bill(self, chamber, current_bill, session):
        """
        Scrape a single bill from Alabama's ACAS site and save it.

        Fetches the bill status page, pulls the internal bill OID out of it,
        then fetches the sponsor and history pages for that OID.  Sponsors,
        amendment/substitute documents, votes and actions are attached to a
        new ``Bill`` which is finally passed to ``self.save_bill``.

        Arguments:
        chamber      -- chamber label handed unchanged to ``Bill``, ``Vote``
                        and ``add_action``
        current_bill -- bill number, interpolated into the ``BillNumber``
                        query argument
        session      -- indexable pair; ``session[0]`` goes into the status
                        URL and ``session[1]`` is passed to ``Bill``

        Raises ``Exception`` when the ACAS session cookie has expired or when
        no bill OID can be found on the status page.
        """
        with self.soup_context(
            "http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp?BillNumber=%s&GetStatus=Get+Status&session=%s"
            % (current_bill, session[0])
        ) as status_page:
            status_html = str(status_page)
            if "Your ACAS Session has expired." in status_html:
                raise Exception("Expired cookie - you'll have to run with -n to skip caching")
            try:
                # The OID only appears embedded in a "BTN<digits>" token.
                bill_id = int(re.findall(r"BTN([0-9]+)", status_html)[0])
            except (IndexError, ValueError):
                raise Exception("No bill found. Hopefully that means it's the end of the session")
            title = status_page.find("td", {"colspan": "7"}).string
            self.log("Starting parse of %s" % current_bill)
            # create our bill!
            bill = Bill(session[1], chamber, current_bill, title.strip())

            # add sponsors and co-sponsors
            with self.soup_context(
                "http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id
            ) as sponsors:
                # Walk up from the "Co-Sponsors" text to the enclosing layout
                # element, which must hold exactly two tables: primary
                # sponsors first, co-sponsors second.
                (primary, secondary) = sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll(
                    "table"
                )
                for sponsor_cell in primary.findAll("td"):
                    bill.add_sponsor("primary", sponsor_cell.string)
                for sponsor_cell in secondary.findAll("td"):
                    bill.add_sponsor("cosponsor", sponsor_cell.string)

            with self.soup_context(
                "http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id
            ) as history:
                actions = history.findAll("table", text="Committee")[0].parent.parent.parent.findAll("tr")
                # Column order:
                # Date Amend/Subst Matter Committee Nay Yea Abs Vote
                for event in actions:
                    cells = event.findAll("td")
                    if not cells:
                        continue
                    date = cells[0].string
                    amend = cells[1].find("input")
                    matter = cells[2].string
                    y_votes = cells[5].string
                    n_votes = cells[4].string
                    a_votes = cells[6].string

                    if not matter:
                        continue

                    roll = cells[7].find("input")

                    # NOTE(review): rows without a date reuse act_date from
                    # the previous row; if the very first row has no date,
                    # act_date is unbound below -- confirm this is intended.
                    if date is not None:
                        act_date = dt.datetime.strptime(date, "%m/%d/%Y")

                    if amend is not None:
                        # The document path and file name are arguments 4 and
                        # 3 of the button's documentSelected(...) handler.
                        splitter = re.findall(
                            r"documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'",
                            str(amend),
                        )[0]
                        amend = "http://alisondb.legislature.state.al.us/acas/%s/%s" % (splitter[3], splitter[2])
                        bill.add_document(matter, amend)

                    if roll is not None:
                        # voteSelected(MOID, VOTE, BODY, <unused>, SESS)
                        splitter = re.findall(
                            r"voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'", str(roll)
                        )[0]
                        roll = (
                            "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s"
                            % (splitter[0], splitter[1], splitter[2], splitter[4])
                        )
                        with self.soup_context(roll) as votes:
                            vote_rows = votes.findAll("table", text="Member")[0].parent.parent.parent.findAll("tr")

                            yea_votes = int(
                                votes.findAll("tr", text="Total Yea:")[0].parent.parent.findAll("td")[2].string
                            )
                            nay_votes = int(
                                votes.findAll("tr", text="Total Nay:")[0].parent.parent.findAll("td")[2].string
                            )
                            abs_votes = int(
                                votes.findAll("tr", text="Total Abs:")[0].parent.parent.findAll("td")[2].string
                            )
                            p_votes = len(votes.findAll("tr", text="P"))

                            # chamber, date, motion, passed, yes_count, no_count, other_count
                            vote = Vote(
                                chamber,
                                act_date,
                                matter,
                                (yea_votes > nay_votes),
                                yea_votes,
                                nay_votes,
                                abs_votes + p_votes,
                            )

                            vote.add_source(roll)
                            for row in vote_rows:
                                row_text = str(row)
                                # Skip the totals rows; they are not members.
                                if "Total Yea" in row_text or "Total Nay" in row_text or "Total Abs" in row_text:
                                    continue
                                vote_cells = row.findAll("td")
                                if not vote_cells:
                                    continue

                                # Each row can carry two (name, vote) pairs
                                # side by side: cells 0/2 and cells 4/6.
                                (name, t) = vote_cells[0].string, vote_cells[2].string
                                self.dumb_vote(vote, name, t)

                                if len(vote_cells) > 3:
                                    (name, t) = vote_cells[4].string, vote_cells[6].string
                                    self.dumb_vote(vote, name, t)
                            bill.add_vote(vote)

                    # NOTE(review): a row that has both a roll-call link and
                    # a yea count adds a second, summary-only Vote here in
                    # addition to the detailed one above -- confirm intended.
                    if y_votes is not None:
                        yea_votes = self.dumber_vote(y_votes)
                        nay_votes = self.dumber_vote(n_votes)
                        abs_votes = self.dumber_vote(a_votes)
                        vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                        bill.add_vote(vote)

                    bill.add_action(chamber, matter, act_date)
            self.save_bill(bill)
    def scrape_new_session(self, chamber, session):
        """
        Scrapes SD's bill data from 2009 on.

        Downloads the session's bill list, then for every link whose text
        looks like "<SB|HB>&nbsp;<number>" downloads the bill history page
        and records its versions, actions and votes on a new ``Bill`` that
        is handed to ``self.save_bill``.

        Arguments:
        chamber -- 'upper' (Senate bills, "SB") or 'lower' (House bills, "HB")
        session -- session identifier, interpolated into the site URLs and
                   passed to ``Bill``

        Raises ``ValueError`` if ``chamber`` is neither 'upper' nor 'lower'.
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        elif chamber == 'lower':
            bill_abbr = 'HB'
        else:
            # Fail before any network traffic instead of tripping over an
            # unbound bill_abbr further down.
            raise ValueError("unknown chamber: %r" % (chamber,))

        # Get bill list page
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'BillList.aspx'
        self.log('Getting bill list for %s %s' % (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Format of bill link contents (abbreviation, non-breaking space,
        # number) and of action dates
        bill_re = re.compile(u'%s\xa0(\\d+)' % bill_abbr)
        date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a'):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            bill_match = bill_re.search(bill_link.contents[0])
            if not bill_match:
                # Not a link to a bill of this chamber
                continue

            # Parse bill ID and name; the name is taken from the element
            # that follows the link
            bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
            bill_name = bill_link.findNext().contents[0]

            # Download history page
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get all bill versions (second table on the page, skipping its
            # first two rows)
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                version_link = row.findAll('td')[1].a
                version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                    session, version_link['href'])
                version_name = version_link.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions (first table on the page, skipping its first six
            # rows)
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text='Action'):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = dt.datetime.strptime(date_match.group(0),
                                                "%m/%d/%Y")

                # Get the action string: plain text nodes are used as-is,
                # tags contribute their first child
                action_parts = []
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action_parts.append(node.contents[0])

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            vote_url = "http://legis.state.sd.us/sessions/"\
                                "%s/%s" % (session, node['href'])

                            vote = self.scrape_new_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action_parts.append(node)
                action = "".join(action_parts).strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Example #3
0
    def scrape_bills(self, chamber, year):
        """
        Scrape every bill of one chamber for the session starting in ``year``.

        Bill numbers are tried sequentially, starting at 1 for the upper
        chamber ("SB") and at 4001 otherwise ("HB"), until
        ``self.scrape_bill`` reports that no page exists.  Each bill's title,
        sponsors, versions, documents and actions are collected on a ``Bill``
        and passed to ``self.save_bill``.

        Arguments:
        chamber -- 'upper' for Senate bills; any other value is treated as
                   the House
        year    -- session year (int or numeric string)

        Raises ``NoDataForPeriod`` for even years.
        """
        if int(year) % 2 == 0:
            raise NoDataForPeriod(year)
        year = int(year)
        # self.scrape_bill may hand back a different year, so remember the
        # one the session is named after.
        session_year = year
        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'

        while True:
            (bill_page, year) = self.scrape_bill(year, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy thing.
            if bill_page is None:
                return
            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n', '').replace('\r', '')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill("%d" % session_year, chamber, bill_id, title)

            # sponsors: the first link is the primary sponsor, the rest are
            # cosponsors
            sponsor_links = bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a')
            for position, name in enumerate(sponsor_links):
                sponsor_type = 'primary' if position == 0 else 'cosponsor'
                the_bill.add_sponsor(sponsor_type, name.string)

            # versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_version(*r)

            # documents: both sections are optional, so look each one up by
            # id and skip it when the page does not have it
            for section_id in ('frg_billstatus_HlaTable', 'frg_billstatus_SfaSection'):
                section = bill_page.find(id=section_id)
                if section is None:
                    continue
                for doc in section.findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r:
                        the_bill.add_document(*r)

            the_bill.add_source('http://legislature.mi.gov/doc.aspx?%d-%s-%04d' % (year, abbr, bill_no))
            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
    def scrape_old_session(self, chamber, session):
        """
        Scrape SD's bill data from 1997 through 2008.

        Downloads the session's bill list, then for every link whose href
        ends in "<digit>.htm" and whose text starts with "<SB|HB> <number>"
        downloads the bill history page and records its versions, actions
        and votes on a new ``Bill`` that is handed to ``self.save_bill``.

        Arguments:
        chamber -- 'upper' for Senate bills ("SB"); any other value is
                   treated as the House ("HB")
        session -- session identifier, interpolated into the site URLs and
                   passed to ``Bill``
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        else:
            bill_abbr = 'HB'

        # Get bill list page
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'billlist.htm'
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Bill link text and action date formats
        bill_re = re.compile(r'%s (\d+)' % bill_abbr)
        date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a', href=re.compile(r'\d\.htm$')):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            bill_match = bill_re.match(bill_link.contents[0])
            if not bill_match:
                # Not bill link
                continue

            # Get the bill ID and name; the name is taken from the element
            # that follows the link
            bill_id = bill_link.contents[0]
            bill_name = bill_link.findNext().contents[0]

            # Get history page
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            # Add bill
            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get bill versions (second table on the page, skipping its
            # first two rows); hrefs here are site-absolute paths
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                version_link = row.findAll('td')[1].a
                version_url = "http://legis.state.sd.us" + version_link['href']
                version_name = version_link.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions (first table on the page, skipping its first six
            # rows)
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text="Action"):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = dt.datetime.strptime(date_match.group(0),
                                                "%m/%d/%Y")

                # Get the action string: plain text nodes are used as-is,
                # tags contribute their first child
                action_parts = []
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action_parts.append(node.contents[0])

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!  The link is either
                            # site-absolute or relative to the session
                            # directory.
                            vote_href = node['href']
                            if vote_href.startswith('/'):
                                vote_url = "http://legis.state.sd.us%s" % (
                                    vote_href)
                            else:
                                vote_url = "http://legis.state.sd.us/"\
                                    "sessions/%s/%s" % (session, vote_href)

                            vote = self.scrape_old_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action_parts.append(node)
                action = "".join(action_parts).strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)