Esempio n. 1
0
    def scrape_bills(self, chamber, year):
        if year != "2009":
            raise NoDataForYear

        if chamber == "upper":
            other_chamber = "lower"
            bill_id = "SB 1"
        else:
            other_chamber = "upper"
            bill_id = "HB 1"

        b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
        b1.add_source("http://example.com")
        b1.add_version("As Introduced", "http://example.com/SB1.html")
        b1.add_document("Google", "http://google.com")
        b1.add_sponsor("primary", "Bob Smith")
        b1.add_sponsor("secondary", "Johnson, Sally")

        d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
        v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
        v1.yes("Bob Smith")
        v1.yes("Sally Johnson")

        d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
        v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
        v2.no("B. Smith")
        v2.other("Sally Johnson")

        b1.add_vote(v1)
        b1.add_vote(v2)

        b1.add_action(chamber, "introduced", d1)
        b1.add_action(chamber, "read first time", d1)
        b1.add_action(other_chamber, "introduced", d2)

        self.save_bill(b1)
Esempio n. 2
0
    def scrape_session(self, chamber, year):
        if chamber == "upper":
            bill_abbr = "SB|SCR|SJR"
        elif chamber == "lower":
            bill_abbr = "HB|HCR|HJR"

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = "0101" + year[2:]
        date2 = "1231" + year2[2:]

        # Get bill list
        bill_list_url = "http://www.legis.state.ak.us/" "basis/range_multi.asp?session=%s&date1=%s&date2=%s" % (
            session,
            date1,
            date2,
        )
        self.log("Getting bill list for %s %s (this may take a long time)." % (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(" ", "")
            bill_name = link.parent.parent.findNext("td").find("font").contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page and strip malformed t
            info_url = "http://www.legis.state.ak.us/basis/%s" % link["href"]
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(" (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})", spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(",")
                bill.add_sponsor("primary", sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor("cosponsor", sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor("committee", spons_str.strip())

            # Get actions
            act_rows = info_page.findAll("table", "myth")[1].findAll("tr")[1:]
            for row in act_rows:
                cols = row.findAll("td")
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, "%m/%d/%y")

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    vote = self.parse_vote(bill, action, act_chamber, act_date, cols[1].a["href"])
                    bill.add_vote(vote)

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill["subjects"] = []
            subject_link_re = re.compile(".*subject=\w+$")
            for subject_link in info_page.findAll("a", href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill["subjects"].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session,
                bill_id,
            )
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile("^get_bill_text?")
            for text_link in text_list.findAll("a", href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (text_link["href"])

                bill.add_version(text_name, text_url)

            self.add_bill(bill)
Esempio n. 3
0
    def scrape_session(self, chamber, year):
        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page and strip malformed t
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                bill.add_sponsor('primary', sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor('cosponsor', sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor('committee', spons_str.strip())

            # Get actions
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except:
                        self.log("Failed parsing vote at %s" %
                                 cols[1].a['href'])

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile('.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile('^get_bill_text?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
Esempio n. 4
0
    def scrape_bill(self, chamber, current_bill, session):
        other_chamber = 'upper' if chamber == 'lower' else 'lower'
        with self.soup_context("http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp?BillNumber=%s&GetStatus=Get+Status&session=%s" % (current_bill, session[0])) as bill:
             if "Your ACAS Session has expired." in str(bill):
                 raise Exception("Expired cookie - you'll have to run with -n to skip caching")
             try:
                 bill_id = int(re.findall(r'BTN([0-9]+)', str(bill))[0])
             except:
                 raise Exception("No bill found. Hopefully that means it's the end of the session") 
             title = bill.find("td", {'colspan': '7'}).string
             self.log("Starting parse of %s" % current_bill)
             #create our bill!
             bill = Bill(session[1], chamber, current_bill, title.strip())

             #add sponsors and co-sponsors
             with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id) as sponsors:
                 # This pains me.
                 (primary,secondary) = sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll('table')
                 for p in primary.findAll('td'):
                     bill.add_sponsor('primary', p.string)
                 for s in secondary.findAll('td'):
                     bill.add_sponsor('cosponsor', s.string)
             with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id) as history:
                  actions = history.findAll('table', text="Committee")[0].parent.parent.parent.findAll('tr')
                  #Date Amend/Subst Matter Committee Nay Yea Abs Vote
                  for event in actions:
                       e = event.findAll('td')
                       if len(e) == 0:
                           continue
                       date = e[0].string
                       amend = e[1].find('input')
                       matter = e[2].string
                       y_votes = e[5].string
                       n_votes = e[4].string
                       a_votes = e[6].string

                       if not matter:
                           continue

                       roll = e[7].find('input')
                       #(date, amend, matter, committee, nays, yeas, abs, vote_thing) = map(lambda x: x.string, e)
                       if date != None:
                           act_date = dt.datetime.strptime(date, '%m/%d/%Y')
                       if amend != None:
                           splitter = re.findall(r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'', str(amend))[0]
                           amend = "http://alisondb.legislature.state.al.us/acas/%s/%s" % (splitter[3], splitter[2])
                           bill.add_document(matter, amend)

                       if roll != None: 
                          splitter = re.findall(r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'',str(roll))[0]
                          roll = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (splitter[0], splitter[1], splitter[2], splitter[4])
                          with self.soup_context(roll) as votes:
                              vote_rows = votes.findAll('table', text='Member')[0].parent.parent.parent.findAll('tr')
                              
                              yea_votes = int(votes.findAll('tr', text='Total Yea:')[0].parent.parent.findAll('td')[2].string)
                              nay_votes = int(votes.findAll('tr', text='Total Nay:')[0].parent.parent.findAll('td')[2].string)
                              abs_votes = int(votes.findAll('tr', text='Total Abs:')[0].parent.parent.findAll('td')[2].string)
                              p_votes   = len(votes.findAll('tr', text='P'))
                              
                              #chamber, date, motion, passed, yes_count, no_count, other_count
                              vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes + p_votes)
                              
                              vote.add_source(roll)
                              for row in vote_rows:
                                  skip = str(row)
                                  if "Total Yea" in skip or "Total Nay" in skip or "Total Abs" in skip:
                                      continue
                                  html_layouts_are_awesome = row.findAll('td')
                                  if len(html_layouts_are_awesome) == 0:
                                      continue
	
                                  (name, t) = html_layouts_are_awesome[0].string, html_layouts_are_awesome[2].string
                                  self.dumb_vote(vote, name, t)
                                  
                                  if len(html_layouts_are_awesome) > 3:
                                      (name, t) = html_layouts_are_awesome[4].string, html_layouts_are_awesome[6].string
                                      self.dumb_vote(vote, name, t)
                              bill.add_vote(vote)

                       if y_votes != None:
                           yea_votes = self.dumber_vote(y_votes)
                           nay_votes = self.dumber_vote(n_votes)
                           abs_votes = self.dumber_vote(a_votes)
                           vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                           bill.add_vote(vote)
                       
                       bill.add_action(chamber, matter, act_date)
             self.add_bill(bill)
Esempio n. 5
0
    def scrape_new_session(self, chamber, session):
        """
        Scrapes SD's bill data from 2009 on.
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        elif chamber == 'lower':
            bill_abbr = 'HB'

        # Get bill list page
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'BillList.aspx'
        self.log('Getting bill list for %s %s' % (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Format of bill link contents
        bill_re = re.compile(u'%s\xa0(\d+)' % bill_abbr)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a'):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            #print bill_link.contents[0]
            bill_match = bill_re.search(bill_link.contents[0])
            if not bill_match:
                continue

            # Parse bill ID and name
            bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
            bill_name = bill_link.findNext().contents[0]

            # Download history page
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get all bill versions
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                #version_date = row.find('td').string
                version_path = row.findAll('td')[1].a['href']
                version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                    session, version_path)

                version_name = row.findAll('td')[1].a.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text='Action'):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = date_match.group(0)
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

                # Get the action string
                action = ""
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action += node.contents[0]

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            vote_url = "http://legis.state.sd.us/sessions/"\
                                "%s/%s" % (session, node['href'])

                            vote = self.scrape_new_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action += node
                action = action.strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Esempio n. 6
0
    def scrape_old_session(self, chamber, session):
        """
        Scrape SD's bill data from 1997 through 2008.
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        else:
            bill_abbr = 'HB'

        # Get bill list page (and replace malformed tags that some versions of
        # BeautifulSoup choke on)
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'billlist.htm'
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Bill and text link formats
        bill_re = re.compile('%s (\d+)' % bill_abbr)
        text_re = re.compile('/sessions/%s/bills/%s.*\.htm' % (
                session, bill_abbr), re.IGNORECASE)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a', href=re.compile('\d\.htm$')):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            bill_match = bill_re.match(bill_link.contents[0])
            if not bill_match:
                # Not bill link
                continue

            # Get the bill ID and name
            bill_id = bill_link.contents[0]
            bill_name = bill_link.findNext().contents[0]

            # Get history page (replacing malformed tag)
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            # Get URL of latest verion of bill (should be listed last)
            bill_url = history.findAll('a', href=text_re)[-1]['href']
            bill_url = 'http://legis.state.sd.us%s' % bill_url

            # Add bill
            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get bill versions
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                #version_date = row.find('td').string
                version_path = row.findAll('td')[1].a['href']
                version_url = "http://legis.state.sd.us" + version_path

                version_name = row.findAll('td')[1].a.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text="Action"):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = date_match.group(0)
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

                # Get the action string
                action = ""
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action += node.contents[0]

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            if node['href'][0] == '/':
                                vote_url = "http://legis.state.sd.us/%s" % (
                                    node['href'])
                            else:
                                vote_url = "http://legis.state.sd.us/"\
                                    "sessions/%s/%s" % (session, node['href'])

                            vote = self.scrape_old_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action += node
                action = action.strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Esempio n. 7
0
    def scrape_bills(self, chamber, year):
        session = "%s%d" % (year, int(year) + 1)
        if not session in self.metadata['sessions']:
            raise NoDataForYear(year)

        if chamber == 'upper':
            measure_abbr = 'SB'
            chamber_name = 'SENATE'
            house_type = 'S'
        else:
            measure_abbr = 'AB'
            chamber_name = 'ASSEMBLY'
            house_type = 'A'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=measure_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id
            version = self.session.query(CABillVersion).filter_by(
                bill=bill).filter(CABillVersion.bill_xml != None).first()
            if not version:
                # not enough data to import
                continue

            fsbill = Bill(bill_session, chamber, bill_id,
                          version.title,
                          short_title=version.short_title)

            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                fsbill.add_action(actor, action.action, action.action_date)

            for vote in bill.votes:
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    vote_chamber = ''
                    vote_location = full_loc

                fsvote = Vote(vote_chamber,
                              vote.vote_date_time,
                              vote.motion.motion_text or '',
                              result,
                              vote.ayes, vote.noes, vote.abstain,
                              threshold=vote.threshold,
                              location=vote_location)

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                fsbill.add_vote(fsvote)

            self.add_bill(fsbill)