Example No. 1
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sb1.htm"
        page = parse(url).getroot()

        # Bill
        try:
            name = page.cssselect("#legislation h1")[0].text_content().strip()
        except IndexError:
            name = "Unknown"
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor("", a.text_content().strip())

        self.parse_votes(url, page, chamberName, bill)

        # Actions
        for row in page.cssselect("#history tr")[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()

            if "/" not in date:
                continue

            if action_text.startswith("Senate"):
                bill.add_action("upper", action_text, date)
            elif action_text.startswith("House"):
                bill.add_action("lower", action_text, date)

        # Versions
        for row in page.cssselect("#versions a"):
            bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

        self.add_bill(bill)
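
All of these examples target the same Bill/Vote interface from the Fifty State Project / Open States scraper framework. The real classes carry validation and extra metadata; what follows is only a minimal sketch of the surface the examples rely on, with method names taken from the call sites above and below and the dict-backed storage an assumption:

class Bill(dict):
    """Minimal stand-in for the framework's Bill class (a sketch)."""

    def __init__(self, session, chamber, bill_id, title, **kwargs):
        super(Bill, self).__init__(session=session, chamber=chamber,
                                   bill_id=bill_id, title=title, **kwargs)
        for key in ('sponsors', 'versions', 'actions',
                    'sources', 'documents', 'votes'):
            self[key] = []

    def add_sponsor(self, type, name, **kw):
        self['sponsors'].append(dict(type=type, name=name, **kw))

    def add_version(self, name, url, **kw):
        self['versions'].append(dict(name=name, url=url, **kw))

    def add_action(self, actor, action, date, **kw):
        self['actions'].append(dict(actor=actor, action=action, date=date, **kw))

    def add_source(self, url, **kw):
        self['sources'].append(dict(url=url, **kw))

    def add_document(self, name, url, **kw):
        self['documents'].append(dict(name=name, url=url, **kw))

    def add_vote(self, vote):
        self['votes'].append(vote)


class Vote(dict):
    """Minimal stand-in; the constructor order matches the comment in the
    Alabama example below: chamber, date, motion, passed, yes/no/other counts."""

    def __init__(self, chamber, date, motion, passed,
                 yes_count, no_count, other_count, **kwargs):
        super(Vote, self).__init__(chamber=chamber, date=date, motion=motion,
                                   passed=passed, yes_count=yes_count,
                                   no_count=no_count, other_count=other_count,
                                   **kwargs)
        self['yes_votes'], self['no_votes'], self['other_votes'] = [], [], []
        self['sources'] = []

    def yes(self, name):
        self['yes_votes'].append(name)

    def no(self, name):
        self['no_votes'].append(name)

    def other(self, name):
        self['other_votes'].append(name)

    def add_source(self, url):
        self['sources'].append(dict(url=url))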
Example No. 2
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        page = parse(url).getroot()

        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()

            if '/' not in date:
                continue

            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in page.cssselect('#versions a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.add_bill(bill)
Example No. 3
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        page = parse(url).getroot()

        # Grab the interesting tables on the page.
        tables = page.cssselect("table")

        # Bill
        name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

        # Sponsorships
        for a in tables[2].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        self.parse_votes_1999(url, page, chamberName, bill)

        # Actions
        for row in tables[-1].cssselect("tr"):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if "/" not in senate_date and "/" not in house_date:
                continue
            if senate_date:
                bill.add_action("upper", action_text, senate_date)
            if house_date:
                bill.add_action("lower", action_text, house_date)

        self.add_bill(bill)
Example No. 4
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sb1.htm"
        page = parse(url).getroot()

        # Grab the interesting tables on the page.
        tables = page.cssselect("center table")

        # Bill
        name = tables[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        self.parse_votes_2001_2004(url, page, chamberName, bill)

        # Actions
        center = page.cssselect("center table center")[0]

        for row in center.cssselect("table")[-2].cssselect("tr")[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if "/" not in date:
                continue
            if action_text.startswith("Senate"):
                bill.add_action("upper", action_text, date)
            elif action_text.startswith("House"):
                bill.add_action("lower", action_text, date)

        # Versions
        for row in center.cssselect("table")[-1].cssselect("a"):
            bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

        self.add_bill(bill)
Example No. 5
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        page = parse(url).getroot()

        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')

        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        center = page.cssselect('center table center')[0]

        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.add_bill(bill)
Example No. 6
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        page = parse(url).getroot()

        # Grab the interesting tables on the page.
        tables = page.cssselect('table')

        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.add_bill(bill)
Example No. 7
    def get_bill_info(self, chamber, session, bill_detail_url):
	"""Extracts all the requested info for a given bill.  
	
	Calls the parent's methods to enter the results into CSV files.
	"""
        bill_detail_url_base='https://www.revisor.leg.state.mn.us/revisor/pages/search_status/'
        bill_detail_url = urlparse.urljoin(bill_detail_url_base, bill_detail_url)

        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.soup_context(bill_detail_url) as bill_soup:

            bill_id = self.extract_bill_id(bill_soup)
            bill_title = self.extract_bill_title(bill_soup)
            bill = Bill(session, chamber, bill_id, bill_title)

            # get all versions of the bill.
            # Versions of a bill are on a separate page, linked to from the bill
            # details page in a link titled, "Bill Text".
            version_url_base = 'https://www.revisor.leg.state.mn.us'
            bill_version_link = self.extract_bill_version_link(bill_soup)

        version_detail_url = urlparse.urljoin(version_url_base, bill_version_link)

        with self.soup_context(version_detail_url) as version_soup:

            # MN bills can have multiple versions.  Get them all, and loop over
            # the results, adding each one.
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(version_url_base, version['url'])
                bill.add_version(version_name, version_url)

            # grab primary and cosponsors 
            # MN uses "Primary Author" to name a bill's primary sponsor.
            # Everyone else listed will be added as a 'cosponsor'.
            sponsors = self.extract_bill_sponsors(bill_soup)
            primary_sponsor = sponsors[0]
            cosponsors = sponsors[1:]
            bill.add_sponsor('primary', primary_sponsor)
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(bill_soup, chamber)
            for action in bill_actions:
                action_chamber = action['action_chamber']
                action_date = action['action_date']
                action_text = action['action_text']
                bill.add_action(action_chamber, action_text, action_date)

        self.add_bill(bill)
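
This scraper (and the Minnesota and Alabama examples elsewhere on this page) reads pages through a soup_context helper that is not shown. A plausible minimal version, assuming Python 2 with urllib2 and BeautifulSoup 3; the real helper presumably layers in caching and error logging:

from contextlib import contextmanager
import urllib2

from BeautifulSoup import BeautifulSoup


class ScraperBase(object):

    @contextmanager
    def soup_context(self, url):
        # Fetch the page and yield it parsed; there is nothing to tear
        # down afterwards, so the context manager exists mainly for
        # symmetry with urlopen_context-style helpers.
        data = urllib2.urlopen(url).read()
        yield BeautifulSoup(data)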
Example No. 8
    def scrape_bill(self, chamber, session, billid, histurl, year):
        if year[0] != 'R':
            session = year
        else:
            session = self.metadata['session_details'][year][
                'sub_sessions'][int(year[0]) - 1]

        with self.urlopen_context(histurl) as data:
            soup = BS(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.add_bill(bill)
Example No. 9
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        with self.urlopen_context(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # The page omits the year, so fill it in from the session.
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.add_bill(bill)
Example No. 10
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
	"""Extracts all the requested info for a given bill.

	Calls the parent's methods to enter the results into JSON files.
	"""
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.soup_context(bill_detail_url) as bill_soup:

            bill_id = self.extract_bill_id(bill_soup)
            bill_title = self.extract_bill_title(bill_soup)
            bill = Bill(session, chamber, bill_id, bill_title)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.

        with self.soup_context(version_list_url) as version_soup:

            # MN bills can have multiple versions.  Get them all, and loop over
            # the results, adding each one.
            self.debug("Extracting bill versions from: " + version_list_url)
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(VERSION_URL_BASE, version['url'])
                bill.add_version(version_name, version_url)

            # grab primary and cosponsors
            # MN uses "Primary Author" to name a bill's primary sponsor.
            # Everyone else listed will be added as a 'cosponsor'.
            sponsors = self.extract_bill_sponsors(bill_soup)
            primary_sponsor = sponsors[0]
            cosponsors = sponsors[1:]
            bill.add_sponsor('primary', primary_sponsor)
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(bill_soup, chamber)
            for action in bill_actions:
                action_chamber = action['action_chamber']
                action_date = action['action_date']
                action_text = action['action_text']
                bill.add_action(action_chamber, action_text, action_date)

        self.add_bill(bill)
Example No. 11
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        page = parse(url).getroot()

        # Bill
        name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        rows = page.cssselect('center table tr')
        for row in rows:
            if row.text_content().strip() == 'Sponsor and CoSponsors':
                continue
            if row.text_content().strip() == 'Links / Committees / Status':
                break
            for a in row.cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

        # Actions
        # The actions are in a pre table that looks like:
        """    SENATE                         HOUSE
               -------------------------------------
             1/13/95   Read 1st time          2/6/95
             1/31/95   Favorably Reported
             2/1/95    Read 2nd Time          2/7/95
             2/3/95    Read 3rd Time
             2/3/95    Passed/Adopted                   """

        actions = page.cssselect('pre')[0].text_content().split('\n')
        actions = actions[2:]
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()

            if '/' not in senate_date and '/' not in house_date:
                continue

            if senate_date:
                bill.add_action('upper', action_text, senate_date)

            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.add_bill(bill)
Example No. 12
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        page = parse(url).getroot()

        # Bill
        name = page.cssselect("h3 br")[0].tail.split("-", 1)[1].strip()
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

        # Sponsorships
        rows = page.cssselect("center table tr")
        for row in rows:
            if row.text_content().strip() == "Sponsor and CoSponsors":
                continue
            if row.text_content().strip() == "Links / Committees / Status":
                break
            for a in row.cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

        # Actions
        # The actions are in a pre table that looks like:
        """    SENATE                         HOUSE
               -------------------------------------
             1/13/95   Read 1st time          2/6/95
             1/31/95   Favorably Reported
             2/1/95    Read 2nd Time          2/7/95
             2/3/95    Read 3rd Time
             2/3/95    Passed/Adopted                   """

        actions = page.cssselect("pre")[0].text_content().split("\n")
        actions = actions[2:]
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()

            if "/" not in senate_date and "/" not in house_date:
                continue

            if senate_date:
                bill.add_action("upper", action_text, senate_date)

            if house_date:
                bill.add_action("lower", action_text, house_date)

        self.add_bill(bill)
Example No. 13
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        with self.urlopen_context(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = "%s/bill.doc" % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext("p").contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version", session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile("legislator/[SH]\d+\.htm"))

            for sponsor_link in sponsor_links:
                bill.add_sponsor("primary", sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext("p")[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if not action or action == "last action" or "Prefiled" in action:
                    continue

                action_date = action.split("-")[0]
                action_date = dt.datetime.strptime(action_date, "%b %d")
                # The page omits the year, so fill it in from the session.
                action_date = action_date.replace(year=int("20" + session[2:4]))

                action = "-".join(action.split("-")[1:])

                if action.endswith("House") or action.endswith("(H)"):
                    actor = "lower"
                elif action.endswith("Senate") or action.endswith("(S)"):
                    actor = "upper"
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile(".*/vote_history.pdf"))
            if vote_link:
                bill.add_document("vote_history.pdf", bill_info_url.replace(".htm", "") + "/vote_history.pdf")

            self.save_bill(bill)
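
The year-fixing step in the two examples above is needed because strptime with a '%b %d' format has no year field and silently defaults to 1900. A quick illustration:

import datetime as dt

d = dt.datetime.strptime('Feb 2', '%b %d')
print d                      # 1900-02-02 00:00:00 -- year defaults to 1900
print d.replace(year=2008)   # 2008-02-02 00:00:00

# Caveat: 'Feb 29' makes strptime itself raise ValueError, because the
# implicit year 1900 is not a leap year, so the replace() never runs.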
Example No. 14
    def parse_bill_xml(self, chamber, session, txt):
        root = lxml.etree.fromstring(txt)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        if session[2] == 'R':
            session = session[0:2]

        bill = Bill(session, chamber, bill_id, bill_title)

        for action in root.findall('actions/action'):
            act_date = dt.datetime.strptime(action.findtext('date'),
                                            "%m/%d/%Y")

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[extra['action_number'][0]]

            bill.add_action(actor, action.findtext('description'),
                            act_date, **extra)

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsor('author', author)
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsor('coauthor', coauthor)
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsor('sponsor', sponsor)
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsor('cosponsor', cosponsor)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
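
For reference, the element names read by parse_bill_xml imply input shaped roughly like the fragment below. This is a reconstruction from the code, not a captured document: the root tag name, the "81R HB 1" attribute format, and the 'scraper' object in the final call are all illustrative assumptions.

import lxml.etree  # the method above expects raw XML text

txt = """\
<billhistory bill="81R HB 1">
  <caption>General Appropriations Bill.</caption>
  <authors>Pitts | Otto</authors>
  <coauthors></coauthors>
  <sponsors>Ogden</sponsors>
  <cosponsors></cosponsors>
  <actions>
    <action>
      <date>01/26/2009</date>
      <actionNumber>H001</actionNumber>
      <description>Filed</description>
      <comment>referred to Appropriations</comment>
    </action>
  </actions>
  <subjects>
    <subject>State Finances--Appropriations</subject>
  </subjects>
</billhistory>
"""

# session '81R' satisfies the session[2] == 'R' check above and is
# trimmed to '81'; the 'H' in actionNumber maps the action to 'lower'.
bill = scraper.parse_bill_xml('lower', '81R', txt)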
Example No. 15
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for actor, action, date in actions:
        bill.add_action(actor, action, date)  # kwargs are permitted if we have 'em.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for sponsor_type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
Example No. 16
    def scrape1997(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1997_98/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = []
            for table in page.cssselect('center table'):
                if table.get('border') == '5':
                    tables.append(table)

            # Bill
            name = page.cssselect('tr > td > font > b')[0].text_content().split(
                '-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[0].cssselect('a'):
                if a.text_content().strip() == 'Current':
                    break
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in tables[1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example No. 17
    def scrape_bills(self, chamber, year):
        if year != "2009":
            raise NoDataForYear

        if chamber == "upper":
            other_chamber = "lower"
            bill_id = "SB 1"
        else:
            other_chamber = "upper"
            bill_id = "HB 1"

        b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
        b1.add_source("http://example.com")
        b1.add_version("As Introduced", "http://example.com/SB1.html")
        b1.add_document("Google", "http://google.com")
        b1.add_sponsor("primary", "Bob Smith")
        b1.add_sponsor("secondary", "Johnson, Sally")

        d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
        v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
        v1.yes("Bob Smith")
        v1.yes("Sally Johnson")

        d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
        v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
        v2.no("B. Smith")
        v2.other("Sally Johnson")

        b1.add_vote(v1)
        b1.add_vote(v2)

        b1.add_action(chamber, "introduced", d1)
        b1.add_action(chamber, "read first time", d1)
        b1.add_action(other_chamber, "introduced", d2)

        self.save_bill(b1)
Example No. 18
    def scrape_session(self, chamber, year):
        if chamber == "upper":
            bill_abbr = "SB|SCR|SJR"
        elif chamber == "lower":
            bill_abbr = "HB|HCR|HJR"

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = "0101" + year[2:]
        date2 = "1231" + year2[2:]

        # Get bill list
        bill_list_url = "http://www.legis.state.ak.us/" "basis/range_multi.asp?session=%s&date1=%s&date2=%s" % (
            session,
            date1,
            date2,
        )
        self.log("Getting bill list for %s %s (this may take a long time)." % (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(" ", "")
            bill_name = link.parent.parent.findNext("td").find("font").contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page and strip malformed tags
            info_url = "http://www.legis.state.ak.us/basis/%s" % link["href"]
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(" (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})", spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(",")
                bill.add_sponsor("primary", sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor("cosponsor", sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor("committee", spons_str.strip())

            # Get actions
            act_rows = info_page.findAll("table", "myth")[1].findAll("tr")[1:]
            for row in act_rows:
                cols = row.findAll("td")
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, "%m/%d/%y")

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    vote = self.parse_vote(bill, action, act_chamber, act_date, cols[1].a["href"])
                    bill.add_vote(vote)

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill["subjects"] = []
            subject_link_re = re.compile(".*subject=\w+$")
            for subject_link in info_page.findAll("a", href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill["subjects"].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session,
                bill_id,
            )
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile("^get_bill_text?")
            for text_link in text_list.findAll("a", href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (text_link["href"])

                bill.add_version(text_name, text_url)

            self.add_bill(bill)
Example No. 19
    def scrape_session(self, chamber, year):
        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page and strip malformed tags
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                bill.add_sponsor('primary', sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor('cosponsor', sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor('committee', spons_str.strip())

            # Get actions
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except Exception:
                        self.log("Failed parsing vote at %s" %
                                 cols[1].a['href'])

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile('.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile('^get_bill_text?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
Example No. 20
    def get_bill_info(self, session, sub, bill_id):
        bill_detail_url = (
            "http://www.ncga.state.nc.us/gascripts/"
            "BillLookUp/BillLookUp.pl?bPrintable=true"
            "&Session=%s&BillID=%s&votesToView=all" % (session[0:4] + sub, bill_id)
        )

        # parse the bill data page, finding the latest html text
        if bill_id[0] == "H":
            chamber = "lower"
        else:
            chamber = "upper"

        bill_data = self.urlopen(bill_detail_url)
        bill_soup = self.soup_parser(bill_data)

        bill_title = bill_soup.findAll(
            "div", style="text-align: center; font: bold" " 20px Arial; margin-top: 15px;" " margin-bottom: 8px;"
        )[0].contents[0]

        bill = Bill(session + sub, chamber, bill_id, bill_title)
        bill.add_source(bill_detail_url)

        # get all versions
        links = bill_soup.findAll("a", href=re.compile("/Sessions/%s/Bills/\w+/HTML" % session[0:4]))

        for link in links:
            version_name = link.parent.previousSibling.previousSibling
            version_name = version_name.contents[0].replace("&nbsp;", " ")
            version_name = version_name.replace(u"\u00a0", " ")

            version_url = "http://www.ncga.state.nc.us" + link["href"]
            bill.add_version(version_name, version_url)

        # figure out which table has sponsor data
        sponsor_table = bill_soup.findAll("th", text="Sponsors", limit=1)[0].findParents("table", limit=1)[0]

        sponsor_rows = sponsor_table.findAll("tr")
        for leg in sponsor_rows[1].td.findAll("a"):
            bill.add_sponsor("primary", leg.contents[0].replace(u"\u00a0", " "))
        for leg in sponsor_rows[2].td.findAll("a"):
            bill.add_sponsor("cosponsor", leg.contents[0].replace(u"\u00a0", " "))

        action_table = bill_soup.findAll("th", text="Chamber", limit=1)[0].findParents("table", limit=1)[0]

        for row in action_table.findAll("tr"):
            cells = row.findAll("td")
            if len(cells) != 3:
                continue

            act_date, actor, action = map(lambda x: self.flatten(x), cells)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            if actor == "Senate":
                actor = "upper"
            elif actor == "House":
                actor = "lower"
            elif action.endswith("Gov."):
                actor = "Governor"

            bill.add_action(actor, action, act_date)

        for vote in bill_soup.findAll("a", href=re.compile("RollCallVoteTranscript")):
            self.get_vote(bill, vote["href"])

        self.add_bill(bill)
Example No. 21
    def scrape_bill(self, chamber, current_bill, session):
        other_chamber = 'upper' if chamber == 'lower' else 'lower'
        with self.soup_context("http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp?BillNumber=%s&GetStatus=Get+Status&session=%s" % (current_bill, session[0])) as bill:
             if "Your ACAS Session has expired." in str(bill):
                 raise Exception("Expired cookie - you'll have to run with -n to skip caching")
             try:
                 bill_id = int(re.findall(r'BTN([0-9]+)', str(bill))[0])
             except:
                 raise Exception("No bill found. Hopefully that means it's the end of the session") 
             title = bill.find("td", {'colspan': '7'}).string
             self.log("Starting parse of %s" % current_bill)
             #create our bill!
             bill = Bill(session[1], chamber, current_bill, title.strip())

             #add sponsors and co-sponsors
             with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id) as sponsors:
                 # This pains me.
                 (primary,secondary) = sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll('table')
                 for p in primary.findAll('td'):
                     bill.add_sponsor('primary', p.string)
                 for s in secondary.findAll('td'):
                     bill.add_sponsor('cosponsor', s.string)
             with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id) as history:
                  actions = history.findAll('table', text="Committee")[0].parent.parent.parent.findAll('tr')
                  #Date Amend/Subst Matter Committee Nay Yea Abs Vote
                  for event in actions:
                       e = event.findAll('td')
                       if len(e) == 0:
                           continue
                       date = e[0].string
                       amend = e[1].find('input')
                       matter = e[2].string
                       y_votes = e[5].string
                       n_votes = e[4].string
                       a_votes = e[6].string

                       if not matter:
                           continue

                       roll = e[7].find('input')
                       #(date, amend, matter, committee, nays, yeas, abs, vote_thing) = map(lambda x: x.string, e)
                       if date != None:
                           act_date = dt.datetime.strptime(date, '%m/%d/%Y')
                       if amend != None:
                           splitter = re.findall(r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'', str(amend))[0]
                           amend = "http://alisondb.legislature.state.al.us/acas/%s/%s" % (splitter[3], splitter[2])
                           bill.add_document(matter, amend)

                       if roll != None: 
                          splitter = re.findall(r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'',str(roll))[0]
                          roll = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (splitter[0], splitter[1], splitter[2], splitter[4])
                          with self.soup_context(roll) as votes:
                              vote_rows = votes.findAll('table', text='Member')[0].parent.parent.parent.findAll('tr')
                              
                              yea_votes = int(votes.findAll('tr', text='Total Yea:')[0].parent.parent.findAll('td')[2].string)
                              nay_votes = int(votes.findAll('tr', text='Total Nay:')[0].parent.parent.findAll('td')[2].string)
                              abs_votes = int(votes.findAll('tr', text='Total Abs:')[0].parent.parent.findAll('td')[2].string)
                              p_votes   = len(votes.findAll('tr', text='P'))
                              
                              #chamber, date, motion, passed, yes_count, no_count, other_count
                              vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes + p_votes)
                              
                              vote.add_source(roll)
                              for row in vote_rows:
                                  skip = str(row)
                                  if "Total Yea" in skip or "Total Nay" in skip or "Total Abs" in skip:
                                      continue
                                  html_layouts_are_awesome = row.findAll('td')
                                  if len(html_layouts_are_awesome) == 0:
                                      continue
	
                                  (name, t) = html_layouts_are_awesome[0].string, html_layouts_are_awesome[2].string
                                  self.dumb_vote(vote, name, t)
                                  
                                  if len(html_layouts_are_awesome) > 3:
                                      (name, t) = html_layouts_are_awesome[4].string, html_layouts_are_awesome[6].string
                                      self.dumb_vote(vote, name, t)
                              bill.add_vote(vote)

                       if y_votes != None:
                           yea_votes = self.dumber_vote(y_votes)
                           nay_votes = self.dumber_vote(n_votes)
                           abs_votes = self.dumber_vote(a_votes)
                           vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                           bill.add_vote(vote)
                       
                       bill.add_action(chamber, matter, act_date)
             self.add_bill(bill)
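
dumb_vote and dumber_vote are helpers on this Alabama scraper that aren't shown. Judging only from the call sites, dumb_vote files one legislator's vote by a single-letter code and dumber_vote turns a count cell into an integer; a guess at their shape (the 'Y'/'N' codes and the blank-cell handling are assumptions):

    def dumb_vote(self, vote, name, t):
        # File a single member's vote by code; anything that is not a
        # clear yea or nay is counted as 'other'.
        if t == 'Y':
            vote.yes(name)
        elif t == 'N':
            vote.no(name)
        else:
            vote.other(name)

    def dumber_vote(self, s):
        # Vote-count cells may be blank; treat anything non-numeric as 0.
        try:
            return int(s)
        except (TypeError, ValueError):
            return 0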
Example No. 22
    def scrape_bills(self, chamber, year):
        """
        Scrape the ND bills considered in a given chamber during a given year.
        """
        # Error checking
        if year not in self.metadata['session_details']:
            raise NoDataForYear(year)
        
        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            norm_chamber_name = 'Senate'
        else:
            url_chamber_name = 'house'
            norm_chamber_name = 'House'
        
        assembly_url = '/assembly/%i-%s' % (
            self.metadata['session_details'][str(year)]['number'],
            year)
        
        chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)
        
        list_url = self.site_root + assembly_url + chamber_url
        
        # Parsing
        soup = self.parser.parse(self.urlopen(list_url))
        
        if not soup:
            raise ScrapeError('Failed to parse legislative list page.')
        
        table = soup.find('table', summary=norm_chamber_name + ' Bills')
        
        bill_links = table.findAll('a', href=re.compile('bill-actions'))
        indexed_bills = {}
        
        self.log('Scraping %s bills for %s.' % (norm_chamber_name, year))
        
        for link in bill_links:
            # Populate base attributes
            attributes = {
                'session': year,
                'chamber': chamber,
                }
            
            bill_number = link.contents[0]
            
            if not re.match('^[0-9]{4}$', bill_number):
                raise ScrapeError('Bill number not in expected format.')
            
            # ND bill prefixes are coded numerically
            if bill_number[0] == '1':
                bill_prefix = 'HB'
            elif bill_number[0] == '2':
                bill_prefix = 'SB'
            elif bill_number[0] == '3':
                bill_prefix = 'HCR'
            elif bill_number[0] == '4':
                bill_prefix = 'SCR'
            elif bill_number[0] == '5':
                bill_prefix = 'HR'
            elif bill_number[0] == '6':
                bill_prefix = 'SR'
            elif bill_number[0] == '7':
                bill_prefix = 'HMR'
            elif bill_number[0] == '8':
                bill_prefix = 'SMR'
                
            attributes['bill_id'] = bill_prefix + ' ' + bill_number
            
            # Skip duplicates (bill is listed once for each version)
            if attributes['bill_id'] in indexed_bills.keys():
                continue
            
            self.debug(attributes['bill_id'])
            
            # Parse details page                
            attributes.update(
                self.scrape_bill_details(assembly_url, bill_number))
        
            # Create bill
            bill = Bill(**attributes)
            
            # Parse actions      
            (actions, actions_url) = self.scrape_bill_actions(
                assembly_url, bill_number, year)
            bill.add_source(actions_url)
            
            for action in actions:
                bill.add_action(**action)

            # Parse versions
            (versions, versions_url) = self.scrape_bill_versions(
                assembly_url, bill_number)
            bill.add_source(versions_url)
            
            for version in versions:
                bill.add_version(**version)
            
            # Add bill to dictionary, indexed by its id
            indexed_bills[attributes['bill_id']] = bill
        
        # Parse sponsorship data
        if int(year) >= 2005:
            self.log('Scraping sponsorship data.')
            
            (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url)
            
            for bill_id, sponsor_list in sponsors.items():
                for sponsor in sponsor_list:
                    # It's possible a bill was misnamed somewhere... but that's
                    # not a good enough reason to error out
                    if bill_id in indexed_bills.keys():
                        bill = indexed_bills[bill_id]
                        bill.add_sponsor(**sponsor)
                        bill.add_source(sponsors_url)
        else:
            self.log('Sponsorship data not available for %s.' % year)
                
        self.log('Saving scraped bills.')
        
        # Save bill
        for bill in indexed_bills.values():
            self.save_bill(bill)
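
As an aside, the digit-to-prefix elif chain in the middle of this method could be collapsed into a lookup table; a sketch with the same codes (an unknown leading digit would then surface as a KeyError instead of an undefined bill_prefix):

ND_BILL_PREFIXES = {
    '1': 'HB', '2': 'SB', '3': 'HCR', '4': 'SCR',
    '5': 'HR', '6': 'SR', '7': 'HMR', '8': 'SMR',
}

bill_prefix = ND_BILL_PREFIXES[bill_number[0]]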
Example No. 23
    def scrape_new_session(self, chamber, session):
        """
        Scrapes SD's bill data from 2009 on.
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        elif chamber == 'lower':
            bill_abbr = 'HB'

        # Get bill list page
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'BillList.aspx'
        self.log('Getting bill list for %s %s' % (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Format of bill link contents
        bill_re = re.compile(u'%s\xa0(\d+)' % bill_abbr)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a'):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            #print bill_link.contents[0]
            bill_match = bill_re.search(bill_link.contents[0])
            if not bill_match:
                continue

            # Parse bill ID and name
            bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
            bill_name = bill_link.findNext().contents[0]

            # Download history page
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get all bill versions
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                #version_date = row.find('td').string
                version_path = row.findAll('td')[1].a['href']
                version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                    session, version_path)

                version_name = row.findAll('td')[1].a.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text='Action'):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = date_match.group(0)
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

                # Get the action string
                action = ""
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action += node.contents[0]

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            vote_url = "http://legis.state.sd.us/sessions/"\
                                "%s/%s" % (session, node['href'])

                            vote = self.scrape_new_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action += node
                action = action.strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Example No. 24
    def scrape_old_session(self, chamber, session):
        """
        Scrape SD's bill data from 1997 through 2008.
        """

        if chamber == 'upper':
            bill_abbr = 'SB'
        else:
            bill_abbr = 'HB'

        # Get bill list page (and replace malformed tags that some versions of
        # BeautifulSoup choke on)
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'billlist.htm'
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Bill and text link formats
        bill_re = re.compile('%s (\d+)' % bill_abbr)
        text_re = re.compile('/sessions/%s/bills/%s.*\.htm' % (
                session, bill_abbr), re.IGNORECASE)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a', href=re.compile('\d\.htm$')):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            bill_match = bill_re.match(bill_link.contents[0])
            if not bill_match:
                # Not bill link
                continue

            # Get the bill ID and name
            bill_id = bill_link.contents[0]
            bill_name = bill_link.findNext().contents[0]

            # Get history page (replacing malformed tag)
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            # Get URL of latest version of bill (should be listed last)
            bill_url = history.findAll('a', href=text_re)[-1]['href']
            bill_url = 'http://legis.state.sd.us%s' % bill_url

            # Add bill
            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get bill versions
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                #version_date = row.find('td').string
                version_path = row.findAll('td')[1].a['href']
                version_url = "http://legis.state.sd.us" + version_path

                version_name = row.findAll('td')[1].a.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text="Action"):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = date_match.group(0)
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

                # Get the action string
                action = ""
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action += node.contents[0]

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            if node['href'][0] == '/':
                                vote_url = "http://legis.state.sd.us/%s" % (
                                    node['href'])
                            else:
                                vote_url = "http://legis.state.sd.us/"\
                                    "sessions/%s/%s" % (session, node['href'])

                            vote = self.scrape_old_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action += node
                action = action.strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Example No. 25
    def scrape_session_old(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
            chamber_name = "House"
            other_chamber = "Senate"
        else:
            bill_abbr = "S."
            chamber_name = "Senate"
            other_chamber = "House"

        start_date = '1/1/%s' % session.split('-')[0]
        data = urllib.urlencode({'Date': start_date,
                                 'Body': bill_abbr[0],
                                 'Session': session.split('-')[1]})
        bill_list_url = "http://www.leg.state.vt.us/database/"\
            "rintro/results.cfm"
        bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

        bill_link_re = re.compile(r'.*?Bill=%s\.\d+.*' % bill_abbr[0])
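        # With bill_abbr "H.", a hypothetical href ending in "?Bill=H.123"
        # would match this pattern.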
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.parent.findAll('td')[1].string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[-1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            sponsors = info_page.find(
                text='Sponsor(s):').parent.findNext('td').findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            # Grab actions from the originating chamber
            act_table = info_page.find(
                text='%s Status:' % chamber_name).findNext('table')
            for row in act_table.findAll('tr')[3:]:
                action = row.td.string.replace('&nbsp;', '').strip(':')

                act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace('&nbsp;', '')
                    # Parse the date so actions from both chambers are
                    # stored uniformly (the other-chamber loop below
                    # parses its dates the same way).
                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(chamber, action, act_date)

            # Grab actions from the other chamber
            act_table = info_page.find(
                text='%s Status:' % other_chamber).findNext('table')
            if act_table:
                if chamber == 'upper':
                    act_chamber = 'lower'
                else:
                    act_chamber = 'upper'
                for row in act_table.findAll('tr')[3:]:
                    action = row.td.string.replace('&nbsp;', '').strip(':')

                    act_date = row.findAll('td')[1].b.string.replace(
                        '&nbsp;', '')
                    if act_date != "":
                        detail = row.findAll('td')[2].b
                        if detail and detail.string != "":
                            action += ": %s" % detail.string.replace(
                                '&nbsp;', '')
                        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                        bill.add_action(act_chamber, action, act_date)

            self.save_bill(bill)
Example No. 26
    def get_bill_info(self, session, sub, bill_id):
        bill_detail_url = 'http://www.ncga.state.nc.us/gascripts/'\
            'BillLookUp/BillLookUp.pl?bPrintable=true'\
            '&Session=%s&BillID=%s&votesToView=all' % (
            session[0:4] + sub, bill_id)

        # parse the bill data page, finding the latest html text
        if bill_id[0] == 'H':
            chamber = 'lower'
        else:
            chamber = 'upper'

        bill_data = self.urlopen(bill_detail_url)
        bill_soup = self.soup_parser(bill_data)

        bill_title = bill_soup.findAll('div',
                                       style="text-align: center; font: bold"
                                       " 20px Arial; margin-top: 15px;"
                                       " margin-bottom: 8px;")[0].contents[0]

        bill = Bill(session + sub, chamber, bill_id, bill_title)
        bill.add_source(bill_detail_url)

        # get all versions
        links = bill_soup.findAll('a', href=re.compile(
                r'/Sessions/%s/Bills/\w+/HTML' % session[0:4]))
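        # For session "2009", a hypothetical href like
        # "/Sessions/2009/Bills/House/HTML/H1v1.html" would match.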

        for link in links:
            version_name = link.parent.previousSibling.previousSibling
            version_name = version_name.contents[0].replace('&nbsp;', ' ')
            version_name = version_name.replace(u'\u00a0', ' ')

            version_url = 'http://www.ncga.state.nc.us' + link['href']
            bill.add_version(version_name, version_url)

        # figure out which table has sponsor data
        sponsor_table = bill_soup.findAll('th', text='Sponsors',
                                          limit=1)[0].findParents(
            'table', limit=1)[0]

        sponsor_rows = sponsor_table.findAll('tr')
        for leg in sponsor_rows[1].td.findAll('a'):
            bill.add_sponsor('primary',
                             leg.contents[0].replace(u'\u00a0', ' '))
        for leg in sponsor_rows[2].td.findAll('a'):
            bill.add_sponsor('cosponsor',
                             leg.contents[0].replace(u'\u00a0', ' '))

        action_table = bill_soup.findAll('th', text='Chamber',
                                         limit=1)[0].findParents(
            'table', limit=1)[0]

        for row in action_table.findAll('tr'):
            cells = row.findAll('td')
            if len(cells) != 3:
                continue

            act_date, actor, action = map(self.flatten, cells)
            act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

            if actor == 'Senate':
                actor = 'upper'
            elif actor == 'House':
                actor = 'lower'
            elif action.endswith('Gov.'):
                actor = 'Governor'

            bill.add_action(actor, action, act_date)

        for vote in bill_soup.findAll('a', href=re.compile(
                'RollCallVoteTranscript')):
            self.get_vote(bill, vote['href'])

        self.save_bill(bill)
Example No. 27
    def scrape_bills(self, chamber, year):
        session = "%s%d" % (year, int(year) + 1)
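        # e.g. year "2009" yields the two-year session string "20092010".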
        if session not in self.metadata['sessions']:
            raise NoDataForYear(year)

        if chamber == 'upper':
            measure_abbr = 'SB'
            chamber_name = 'SENATE'
            house_type = 'S'
        else:
            measure_abbr = 'AB'
            chamber_name = 'ASSEMBLY'
            house_type = 'A'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=measure_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id
            version = self.session.query(CABillVersion).filter_by(
                bill=bill).filter(CABillVersion.bill_xml != None).first()
            if not version:
                # not enough data to import
                continue

            fsbill = Bill(bill_session, chamber, bill_id,
                          version.title,
                          short_title=version.short_title)

            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                fsbill.add_action(actor, action.action, action.action_date)

            for vote in bill.votes:
                result = (vote.vote_result == '(PASS)')

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    vote_chamber = ''
                    vote_location = full_loc

                fsvote = Vote(vote_chamber,
                              vote.vote_date_time,
                              vote.motion.motion_text or '',
                              result,
                              vote.ayes, vote.noes, vote.abstain,
                              threshold=vote.threshold,
                              location=vote_location)

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                fsbill.add_vote(fsvote)

            self.add_bill(fsbill)
Example No. 28
    def scrape_session_new(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
        else:
            bill_abbr = "S."

        bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
            session.split('-')[1], bill_abbr[0])
        bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
        bill_list = BeautifulSoup(self.urlopen(bill_list_url))

        bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
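        # With bill_abbr "S.", a hypothetical href containing "Bill=S.42"
        # would match.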
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.findNext('b').string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            act_table = info_page.findAll('blockquote')[2].table
            for row in act_table.findAll('tr')[1:]:
                action = ""
                for s in row.findAll('td')[1].findAll(text=True):
                    action += s + " "
                action = action.strip()

                match = re.search('Governor on (.*)$', action)
                if match:
                    act_date = parse_exec_date(match.group(1).strip())
                    actor = 'Governor'
                else:
                    if row['bgcolor'] == 'Salmon':
                        actor = 'lower'
                    else:
                        actor = 'upper'

                    if row.td.a:
                        act_date = row.td.a.string
                    else:
                        act_date = row.td.string

                    act_date = re.search(
                        r'\d{1,2}/\d{1,2}/\d{4}', act_date).group(0)
                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
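                    # e.g. a cell reading "2/17/2009" (hypothetical) is
                    # extracted by the regex above and parsed to
                    # datetime.datetime(2009, 2, 17, 0, 0).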

                bill.add_action(actor, action, act_date)

                vote_link = row.find('a', text='Details')
                if vote_link:
                    vote_url = vote_link.parent['href']
                    self.parse_vote_new(bill, actor, vote_url)

            sponsors = info_page.find(
                text='Sponsor(s):').parent.parent.findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            self.save_bill(bill)
Example No. 29
    def scrape_session(self, chamber, session):
        if chamber == 'upper':
            chamber_name = 'Senate'
            bill_abbr = 'S'
        elif chamber == 'lower':
            chamber_name = 'House'
            bill_abbr = 'H'

        # Base url for bills sorted by first letter of title
        base_url = 'http://www.flsenate.gov/Session/'\
            'index.cfm?Mode=Bills&BI_Mode=ViewBySubject&'\
            'Letter=%s&Year=%s&Chamber=%s'

        # Bill ID format
        bill_re = re.compile(r"%s (\d{4}[ABCDEO]?)" % bill_abbr)
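        # Hypothetical matches for bill_abbr "S": "S 0024" (group(1) is
        # "0024") and "S 1234B"; the optional letter covers lettered bills.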

        # Go through all sorted bill list pages
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            bill_list_url = base_url % (letter, session.replace(' ', ''),
                                        chamber_name)
            self.log("Getting bill list for %s %s (%s)" % (chamber, session,
                                                           letter))
            bill_list = BeautifulSoup(self.urlopen(bill_list_url))

            # Bill ID's are bold
            for b in bill_list.findAll('b'):
                if not b.string:
                    continue

                match = bill_re.search(b.string)
                if match:
                    # Bill ID and number
                    bill_id = match.group(0)
                    bill_number = match.group(1)

                    # Get bill name and info url
                    bill_link = b.parent.findNext('td').a
                    bill_name = bill_link.string.strip()
                    info_url = "http://www.flsenate.gov/Session/%s&Year=%s" % (
                        bill_link['href'], session.replace(' ', ''))

                    # Create bill
                    bill = Bill(session, chamber, bill_id, bill_name)
                    bill.add_source(info_url)

                    # Get bill info page
                    info_page = BeautifulSoup(self.urlopen(info_url))

                    # Get all bill versions
                    bill_table = info_page.find(
                        'a',
                        attrs={'name': 'BillText'}).parent.parent.findNext(
                        'tr').td.table

                    if bill_table:
                        for tr in bill_table.findAll('tr')[1:]:
                            version_name = tr.td.string
                            version_url = "http://www.flsenate.gov%s" % (
                                tr.a['href'])
                            bill.add_version(version_name, version_url)

                    # Get actions
                    hist_table = info_page.find(
                        'tr', 'billInfoHeader').findPrevious('tr')
                    hist = ""
                    for line in hist_table.findAll(text=True):
                        hist += line + "\n"
                    hist = hist.replace('&nbsp;', ' ')
                    act_re = re.compile(r'^  (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                                        r'(.*\n(\s{16}.*\n)*)',
                                        re.MULTILINE)
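                    # This is written to match history lines shaped like
                    #   "  01/02/09 SENATE Filed"
                    # plus any continuation lines indented 16 spaces
                    # (hypothetical line shown for illustration).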

                    for act_match in act_re.finditer(hist):
                        action = act_match.group(3).replace('\n', ' ')
                        action = re.sub('\s+', ' ', action).strip()
                        if act_match.group(2) == 'SENATE':
                            act_chamber = 'upper'
                        else:
                            act_chamber = 'lower'

                        act_date = act_match.group(1)
                        act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                        for act_text in re.split(' -[HS]J \d+;? ?', action):
                            if not act_text:
                                continue

                            bill.add_action(act_chamber, act_text, act_date)

                    # Get primary sponsor
                    # Right now we just list the committee as the primary
                    # sponsor for committee substitutes. In the future,
                    # consider listing the committee separately and the
                    # original human sponsors as primary.
                    spon_re = re.compile(r'by ([^;(\n]+;?|\w+)')
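                    # e.g. a history fragment "by Smith;" (hypothetical)
                    # captures "Smith;", trimmed to "Smith" below.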
                    sponsor = spon_re.search(hist).group(1).strip('; ')
                    bill.add_sponsor('primary', sponsor)

                    # Get co-sponsors
                    cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                                           r'([\w .]+(;[\w .\n]+)*)',
                                           re.MULTILINE)
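                    # e.g. "(CO-SPONSORS) Smith; Jones" (hypothetical) puts
                    # "Smith; Jones" in group(2), split on ';' below.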
                    cospon_match = cospon_re.search(hist)
                    if cospon_match:
                        for cosponsor in cospon_match.group(2).split(';'):
                            cosponsor = cosponsor.replace('\n', '').strip()
                            bill.add_sponsor('cosponsor', cosponsor)

                    self.add_bill(bill)