Example #1
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
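
Note on plumbing: every example here leans on helpers from a scraper base class (lxml_context, urlopen, save_bill, soup_parser) that the snippets never define. As a rough mental model only, lxml_context presumably fetches a URL and yields the parsed lxml tree; a minimal, hypothetical sketch:

    import contextlib
    import urllib2

    import lxml.html

    @contextlib.contextmanager
    def lxml_context(url):
        # Hypothetical stand-in for the base-class helper: fetch the page
        # and yield it as a parsed lxml tree.
        response = urllib2.urlopen(url)
        try:
            yield lxml.html.fromstring(response.read())
        finally:
            response.close()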
Example #2
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
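
Unlike scrape2009 above, scrape1999 (and scrape1995 in Example #18) passes add_action raw date strings such as '1/13/95'. If the Bill API expects datetimes, a small helper would normalize them; a sketch, assuming the two-digit-year format shown in Example #18:

    import datetime

    def parse_two_digit_date(text):
        # e.g. '1/13/95' -> 1995-01-13; strptime's %y maps 69-99 to 19xx
        return datetime.datetime.strptime(text.strip(), '%m/%d/%y')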
Example #3
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Example #4
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    if 'Special' in insert:
                        session = insert
                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url)


                    primary, secondary = self.scrape_sponsors(page)
                    
                    if primary[0] == 'By:':
                        primary.pop(0)
                        
                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        full_name = ' '.join(primary)
                        bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    minutes_count = 2
                    for mr in root.xpath('//table[4]/tr/td[3]/a'):
                        minutes =  mr.xpath("string(@href)")
                        minutes_url = "http://www.leg.state.nv.us" + minutes
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                        minutes_date = mr.xpath(minutes_date_path).split()
                        minutes_date = ' '.join(minutes_date[:3]) + " Minutes"
                        bill.add_document(minutes_date, minutes_url)
                        minutes_count += 1


                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
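
scrape_links is not shown in this snippet; it presumably returns the relative links to each bill's history page from the report listing. A hypothetical sketch (the href filter is a guess, not taken from the real page):

    import lxml.html

    def scrape_links(self, parentpage_url):
        # Hypothetical: collect per-bill report links from the listing page
        with self.urlopen(parentpage_url) as html:
            root = lxml.html.fromstring(html)
            return [a.get('href')
                    for a in root.xpath('//a[contains(@href, "history")]')]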
Example #5
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            other_chamber = 'lower'
            bill_id = 'SB 1'
        else:
            other_chamber = 'upper'
            bill_id = 'HB 1'

        b1 = Bill(session, chamber, bill_id, 'A super bill')
        b1.add_source('http://example.com/')
        b1.add_version('As Introduced', 'http://example.com/SB1.html')
        b1.add_document('Google', 'http://google.com')
        b1.add_sponsor('primary', 'Bob Smith')
        b1.add_sponsor('secondary', 'Johnson, Sally')

        d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
        v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
        v1.yes('Smith')
        v1.yes('Johnson')

        d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
        v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
        v2.no('Bob Smith')
        v2.other('S. Johnson')

        b1.add_vote(v1)
        b1.add_vote(v2)

        b1.add_action(chamber, 'introduced', d1)
        b1.add_action(chamber, 'read first time', d2)
        b1.add_action(other_chamber, 'introduced', d2)

        self.save_bill(b1)
Example #6
    def scrape_session_2009(self, chamber, session):
        url, bill_type = bills_url(chamber)

        with self.urlopen(url) as page_html:
            page = lxml.html.fromstring(page_html)
            for element, attribute, link, pos in page.iterlinks():
                if re.search("billtype=" + bill_type + "&billnumber=[0-9]+", link) is not None:
                    bill_page_url = bill_url(link)
                    with self.urlopen(bill_page_url) as bill_page_str:
                        bill_page = lxml.html.fromstring(bill_page_str)
                        bill_number = link.split("=")[-1]
                        bill_id = bill_page.cssselect('a[class="headerlink"]')[0].text_content()
                        bill_title = bill_page.cssselect('td[style="color:Black"]')[0].text_content()
                        bill = Bill(session, chamber, bill_id, bill_title)
                        bill.add_source(bill_page_url)

                        actions_table = bill_page.cssselect('table[rules="all"]')[0]
                        action_elements = actions_table.cssselect('tr')
                        # the first row is a header, not an action
                        action_elements.pop(0)

                        for ae in action_elements:
                            action_element_parts = ae.cssselect('td')

                            action_date = action_element_parts[0].text_content()
                            actor_house = action_element_parts[1].text_content().strip()
                            action_text = action_element_parts[2].text_content()

                            # look for acting committees
                            match = re.search("(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})", action_text)

                            if match is not None:
                                actor = match.group(0)
                            elif actor_house == 'H':
                                actor = "lower"
                            elif actor_house == 'S':
                                actor = "upper"
                            else:
                                actor = chamber

                            action_date = dt.datetime.strptime(action_date, '%m/%d/%Y')

                            if re.search("The votes were as follows", action_text) is not None:
                                self.scrape_votes(action_text, bill_page_url, actor_house, action_date, bill)

                            bill.add_action(actor, action_text, action_date)

                        with self.urlopen(versions_page_url(bill_type, bill_number)) as versions_page_html:
                            versions_page = lxml.html.fromstring(versions_page_html)
                            versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                            for ve in versions_elements:
                                element_text = ve.text_content()
                                # strip the literal '_.HTM' suffix (rstrip
                                # would treat it as a character set)
                                version_name = element_text
                                if version_name.endswith('_.HTM'):
                                    version_name = version_name[:-len('_.HTM')]
                                bill.add_version(version_name, bill_version_url(element_text))

                        self.save_bill(bill)
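
The committee pattern in the action loop matches phrases like "committee on FIN" or "committee(s) on JUD/LAB", and the scraper uses the whole matched phrase as the actor. A quick sanity check of what group(0) captures:

    import re

    pattern = r"(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})"

    print re.search(pattern, "Referred to the committee on FIN.").group(0)
    # -> committee on FIN
    print re.search(pattern, "Passed by the committee(s) on JUD/LAB.").group(0)
    # -> committee(s) on JUD/LAB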
Example #7
    def scrape_bill(self, chamber, session, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
            title, primary_sponsor = header.split(' -- ')

            if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
                bill_type = ['bill']
            elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
                bill_type = ['concurrent resolution']
            elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
                bill_type = ['joint resolution']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_link.attrib['href'])

            for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

                name = link.getprevious().tail.strip()
                bill.add_version(name, link.attrib['href'])

            self.save_bill(bill)
Example #8
    def scrape_bill(self, session, chamber, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath("//br")[8].tail
            if not title:
                return
            title = title.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Example #9
    def scrape(self, chamber, session):
        year = year_from_session(session)
        url = bills_url(year)
        with self.urlopen(url) as bills_page_html:
            bills_page = lxml.html.fromstring(bills_page_html)
            table_rows = bills_page.cssselect('tr')
            # Every other row is empty; skip those
            table_rows = table_rows[0:len(table_rows):2]
            for row in table_rows:
                row_elements = row.cssselect('td')

                bill_document = row_elements[0]
                bill_document.make_links_absolute(base_url())

                element, attribute, link, pos = bill_document.iterlinks().next()
                # Strip the '.pdf' suffix (rstrip would strip a character set)
                bill_id = element.text_content()
                if bill_id.endswith('.pdf'):
                    bill_id = bill_id[:-len('.pdf')]
                bill_document_link = link

                title_and_sponsors = row_elements[1]
                title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]', title_and_sponsors.text_content())
                sponsors_match = re.search('[a-z]([A-Z]+.+)', title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split('--')

                bill = Bill(session, chamber, bill_id, title)
                bill.add_version('current', bill_document_link)

                if separated_sponsors[1] == '(NONE)':
                    bill.add_sponsor('primary', separated_sponsors[0])
                else:
                    bill.add_sponsor('cosponsor', separated_sponsors[0])
                    bill.add_sponsor('cosponsor', separated_sponsors[1])

                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute(base_url())
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = base_url() + link.split('?Open&target=')[1]

                self.scrape_actions(frame_link, bill)

                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = base_url() + link.split('?Open&target=')[1]
                self.scrape_votes(frame_link, chamber, bill)

                self.save_bill(bill)
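
One subtlety fixed above: str.rstrip takes a set of characters, not a suffix, so rstrip('.pdf') strips any trailing run of '.', 'p', 'd' and 'f' and over-strips ids that happen to end in one of those letters:

    print 'sb1.pdf'.rstrip('.pdf')      # 'sb1'    (happens to work)
    print 'hb1024d.pdf'.rstrip('.pdf')  # 'hb1024' (the trailing 'd' is eaten too)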
Example #10
    def scrape_bill(self, bill_url, chamber, session):
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            match = re.match(r"^([^:]+): ([^(]+)", summary)

            if match:
                subjects = [match.group(1).strip()]
                title = match.group(2).strip()
            else:
                raise ScrapeError("Bad title")

            bill = Bill(session, chamber, bill_id, title,
                        subjects=subjects)
            bill.add_source(bill_url)

            history_link = page.xpath("//a[text() = 'History']")[0]
            history_url = history_link.attrib['href']
            self.scrape_history(bill, history_url)

            authors_link = page.xpath("//a[text() = 'Authors']")[0]
            authors_url = authors_link.attrib['href']
            self.scrape_authors(bill, authors_url)

            try:
                versions_link = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                versions_url = versions_link.attrib['href']
                self.scrape_versions(bill, versions_url)
            except IndexError:
                # Only current version
                try:
                    version_link = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    version_url = version_link.attrib['href']
                    bill.add_version("%s Current" % bill_id, version_url)
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            try:
                votes_link = page.xpath("//a[text() = 'Votes']")[0]
                self.scrape_votes(bill, votes_link.attrib['href'])
            except IndexError:
                # Some bills don't have any votes
                pass

            self.save_bill(bill)

            return True
Example #11
    def scrape(self, chamber, year):
        # Data prior to 1997 is contained in pdfs
        if year < "1997":
            raise NoDataForYear(year)

        bills_url = "http://www.leg.state.co.us/CLICS/CLICS" + year + "A/csl.nsf/%28bf-1%29?OpenView&Count=2000"
        with self.lxml_context(bills_url) as bills_page:
            table_rows = bills_page.cssselect("tr")
            # Every other row is empty; skip those
            table_rows = table_rows[0 : len(table_rows) : 2]
            for row in table_rows:
                row_elements = row.cssselect("td")

                bill_document = row_elements[0]
                bill_document.make_links_absolute("http://www.leg.state.co.us")

                element, attribute, link, pos = bill_document.iterlinks().next()
                # Strip the '.pdf' suffix (rstrip would strip a character set)
                bill_id = element.text_content()
                if bill_id.endswith(".pdf"):
                    bill_id = bill_id[:-len(".pdf")]
                bill_document_link = link

                title_and_sponsors = row_elements[1]
                title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]", title_and_sponsors.text_content())
                sponsors_match = re.search("[a-z]([A-Z]+.+)", title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split("--")

                bill = Bill(year, chamber, bill_id, title)
                bill.add_version("current", bill_document_link)

                if separated_sponsors[1] == "(NONE)":
                    bill.add_sponsor("primary", separated_sponsors[0])

                else:
                    bill.add_sponsor("cosponsor", separated_sponsors[0])
                    bill.add_sponsor("cosponsor", separated_sponsors[1])

                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute("http://www.leg.state.co.us")
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                self.scrape_actions(frame_link, bill)

                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                self.scrape_votes(frame_link, chamber, bill)

                self.save_bill(bill)
Example #12
    def scrape_year(self, year, chamber):    
        
        sep = '<h1>House</h1>'
            
        if chamber == 'upper':
            after = False
            reg = '[5-9]'
        else:
            after = True
            reg = '[1-4]'
                
        with self.lxml_context("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year), sep, after) as page:
            for element, attribute, link, pos in page.iterlinks():
                if re.search("bill=" + reg + "[0-9]{3}", link) != None:
                    bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                    with self.lxml_context(bill_page_url) as bill_page:
                        raw_title = bill_page.cssselect('title')
                        split_title = raw_title[0].text_content().split(' ')
                        bill_id = split_title[0] + ' ' + split_title[1]
                        bill_id = bill_id.strip()
                        session = split_title[3].strip()

                        title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                        title = title_element.text_content()

                        bill = Bill(session, chamber, bill_id, title)
                        bill.add_source(bill_page_url)
                
                        self.scrape_actions(bill_page, bill)
                
                        for element, attribute, link, pos in bill_page.iterlinks():
                            if re.search("billdocs", link) != None:
                                if re.search("Amendments", link) != None:
                                    bill.add_document("Amendment: " + element.text_content(), link)    
                                elif re.search("Bills", link) != None:
                                    bill.add_version(element.text_content(), link) 
                                else:
                                    bill.add_document(element.text_content(), link)
                            elif re.search("senators|representatives", link) != None:
                                with self.lxml_context(link) as senator_page:
                                    try:
                                        name_tuple = self.scrape_legislator_name(senator_page)
                                        bill.add_sponsor('primary', name_tuple[0])
                                    except:
                                        pass
                            elif re.search("ShowRollCall", link) != None:
                                match = re.search("([0-9]+,[0-9]+)", link)
                                match = match.group(0)
                                match = match.split(',')
                                id1 = match[0]
                                id2 = match[1]
                                url = "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" +id2
                                with self.lxml_context(url) as vote_page:
                                    self.scrape_votes(vote_page, bill, url)
                                    
                        self.save_bill(bill)
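
The ShowRollCall branch rebuilds the roll-call URL from two comma-separated ids embedded in the link. An illustration with made-up ids (the link text is hypothetical; the URL template is the one used above):

    import re

    link = "javascript:ShowRollCall(12345,67890)"
    id1, id2 = re.search("([0-9]+,[0-9]+)", link).group(0).split(',')
    print "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" + id2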
Example #13
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == "lower":
            bill_abbr = "HB"
        else:
            bill_abbr = "SB"

        bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
            session.replace(' ', ''))
        self.log("Getting bill list for %s, %s" % (session, chamber))

        try:
            base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
        except:
            # this session doesn't exist for this year
            return

        bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)

        for link in base_bill_list.findAll('a', href=bill_list_link_re):
            bill_list = self.soup_parser(self.urlopen(link['href']))
            bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)

            for bill_link in bill_list.findAll('a', href=bill_link_re):
                bill_id = bill_link.find(text=True).strip()

                bill_info_url = bill_link['href']
                bill_info = self.soup_parser(self.urlopen(bill_info_url))

                bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                    '&nbsp;', ' ').strip().split(' -- ')

                bill = Bill(session, chamber, bill_id, bill_title)
                bill.add_source(bill_info_url)
                bill.add_sponsor('primary', primary_sponsor)

                status_re = re.compile('.*billsta/%s.*.htm' %
                                       bill_abbr.lower())
                status_link = bill_info.find('a', href=status_re)

                if status_link:
                    self.parse_status(bill, status_link['href'])

                text_find = bill_info.find(
                    text="Bill Text (If you are having trouble viewing")

                if text_find:
                    text_link_re = re.compile('.*\.htm')
                    for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                        version_name = text_link.previous.strip()
                        bill.add_version(version_name, text_link['href'])

                self.save_bill(bill)
Example #14
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # The history page omits the year, so take it from the session
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Example #15
    def scrape_bill(self, chamber, session, billid, histurl, year):
        if year[0] != 'R':
            session = year
        else:
            # Sub-session years look like 'R<n>'; int(year[0]) would choke
            # on the 'R', so index with the digit after it (assumed format)
            session = self.metadata['session_details'][year][
                'sub_sessions'][int(year[1]) - 1]

        with self.urlopen(histurl) as data:
            soup = BeautifulSoup(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Example #16
    def scrape(self, chamber, session):
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {'upper': 'Senate',
                         'lower': 'House+of+Representatives'}[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        self.refresh_session()

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # bills are all their own table with cellspacing=4 (skip first)
            bill_tables = doc.xpath('//table[@cellspacing="4"]')
            for bt in bill_tables[1:]:

                # each table has 3 rows: detail row, description, blank
                details, desc, _ = bt.xpath('tr')

                # first <tr> has img, button, sponsor, topic, current house
                #   current status, committee, committee2, last action
                _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

                # pull bill_id out of script tag (gross)
                bill_id = bill_id_re.search(button.text_content()).group()
                oid = btn_re.search(button.text_content()).groups()[0]

                sponsor = sponsor.text_content()
                topic = topic.text_content()
                com1 = com1.text_content()
                com2 = com2.text_content()
                desc = desc.text_content()

                # create bill
                bill = Bill(session, chamber, bill_id, desc.strip(),
                            topic=topic)
                bill.add_sponsor('primary', sponsor)

                self.get_sponsors(bill, oid)
                self.get_actions(bill, oid)

                # craft bill URL
                session_fragment = '2010rs'
                type_fragment = 'bills'
                bill_id_fragment = bill_id.lower()
                bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                    session_fragment, type_fragment, bill_id_fragment)
                bill.add_version('bill text', bill_text_url)

                self.save_bill(bill)
Example #17
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            bill_soup = BeautifulSoup(bill_html)

            bill_id = self.extract_bill_id(bill_soup)
            bill_title = self.extract_bill_title(bill_soup)
            bill = Bill(session, chamber, bill_id, bill_title)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.

        with self.urlopen(version_list_url) as version_html:
            version_soup = BeautifulSoup(version_html)

            # MN bills can have multiple versions.  Get them all, and loop over
            # the results, adding each one.
            self.debug("Extracting bill versions from: " + version_list_url)
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(VERSION_URL_BASE, version['url'])
                bill.add_version(version_name, version_url)

            # grab primary and cosponsors
            # MN uses "Primary Author" to name a bill's primary sponsor.
            # Everyone else listed will be added as a 'cosponsor'.
            sponsors = self.extract_bill_sponsors(bill_soup)
            primary_sponsor = sponsors[0]
            cosponsors = sponsors[1:]
            bill.add_sponsor('primary', primary_sponsor)
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(bill_soup, chamber)
            for action in bill_actions:
                action_chamber = action['action_chamber']
                action_date = action['action_date']
                action_text = action['action_text']
                bill.add_action(action_chamber, action_text, action_date)

        self.save_bill(bill)
Example #18
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            rows = page.cssselect('center table tr')
            for row in rows:
                if row.text_content().strip() == 'Sponsor and CoSponsors':
                    continue
                if row.text_content().strip() == 'Links / Committees / Status':
                    break
                for a in row.cssselect('a'):
                    bill.add_sponsor('', a.text_content().strip())

            # Actions
            # The actions are in a pre table that looks like:
            """    SENATE                         HOUSE
                   -------------------------------------
                 1/13/95   Read 1st time          2/6/95
                 1/31/95   Favorably Reported
                 2/1/95    Read 2nd Time          2/7/95
                 2/3/95    Read 3rd Time
                 2/3/95    Passed/Adopted                   """

            actions = page.cssselect('pre')[0].text_content().split('\n')
            actions = actions[2:]
            for action in actions:
                senate_date = action[:22].strip()
                action_text = action[23:46].strip()
                house_date = action[46:].strip()

                if '/' not in senate_date and '/' not in house_date:
                    continue

                if senate_date:
                    bill.add_action('upper', action_text, senate_date)

                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #19
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            bill_type = {'F': 'bill', 'R':'resolution',
                         'C': 'concurrent resolution'}[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip())

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'])

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url)

        self.save_bill(bill)
Example #20
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'
        while True:
            bill_page = self.scrape_bill(session, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy thing.
            if bill_page is None:
                return
            bill_page = BeautifulSoup(bill_page)
            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n','').replace('\r','')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            #sponsors
            first = 0
            for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
                the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
                first = 1

            #versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r: the_bill.add_version(*r)

            #documents
            if 'frg_billstatus_HlaTable' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)
            if 'frg_billstatus_SfaSection' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no += 1
Example #21
    def scrape_bill(self, bill_url, chamber, session):
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            bill = Bill(session, chamber, bill_id, summary)

            history_link = page.xpath("//a[text() = 'History']")[0]
            history_url = history_link.attrib['href']
            self.scrape_history(bill, history_url)

            authors_link = page.xpath("//a[text() = 'Authors']")[0]
            authors_url = authors_link.attrib['href']
            self.scrape_authors(bill, authors_url)

            try:
                versions_link = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                versions_url = versions_link.attrib['href']
                self.scrape_versions(bill, versions_url)
            except IndexError:
                # Only current version
                try:
                    version_link = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    version_url = version_link.attrib['href']
                    bill.add_version("%s Current" % bill_id, version_url)
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            self.save_bill(bill)

            return True
Example #22
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)  # kwargs are permitted if we have 'em
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for sponsor_type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
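
Because parse_bill returns the Bill rather than saving it, the caller owns vote handling and persistence. A hypothetical caller, assuming the scraper's usual save_bill and some external vote handler:

    def scrape_status_page(scraper, url):
        bill = parse_bill(scraper, url)
        attach_votes(scraper, bill, url)  # assumed external vote handler, not shown
        scraper.save_bill(bill)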
Example #23
    def scrape_bill(self, chamber, session, url):
        url = url + "&Year=%s" % session
        with self.urlopen(url) as page:
            page = page.replace('&nbsp;', ' ').replace('<br>', '\n')
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath('//h3')[0].text.strip()
            title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

            bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
            bill_id = bill_id.split()[0].strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            hist = page.xpath("string(//pre[@class='billhistory'])").strip()
            act_re = re.compile(r'^  (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                                r'(.*\n(\s{16}.*\n)*)',
                                re.MULTILINE)

            # Actions
            for match in act_re.finditer(hist):
                action = match.group(3).replace('\n', ' ')
                action = re.sub(r'\s+', ' ', action).strip()

                if match.group(2) == 'SENATE':
                    actor = 'upper'
                else:
                    actor = 'lower'

                date = match.group(1)
                date = datetime.datetime.strptime(date, "%m/%d/%y")

                for act_text in re.split(' -[HS]J \d+;? ?', action):
                    act_text = act_text.strip()
                    if not act_text:
                        continue

                    types = []
                    act_lower = act_text.lower()
                    if act_lower.startswith('introduced'):
                        types.append('bill:introduced')
                    if 'referred to' in act_lower:
                        types.append('committee:referred')
                    if 'died in committee' in act_lower:
                        types.append('committee:failed')
                    if 'favorable by' in act_lower:
                        types.append('committee:passed:favorable')
                    if 'amendment(s) adopted' in act_lower:
                        types.append('amendment:passed')

                    bill.add_action(actor, act_text, date, type=types)

            # Sponsors
            primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                        hist).group(1).strip('; ')
            bill.add_sponsor('primary', primary_sponsor)

            cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                                   r'([\w .]+(;[\w .\n]+)*)',
                                   re.MULTILINE)
            match = cospon_re.search(hist)

            if match:
                for cosponsor in match.group(2).split(';'):
                    cosponsor = cosponsor.replace('\n', '').strip()
                    bill.add_sponsor('cosponsor', cosponsor)

            # Versions
            for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
                version = link.xpath('string(../../td[1])').strip()

                bill.add_version(version, link.attrib['href'])

            # House Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
                bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

            # Senate Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
                bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

            self.save_bill(bill)
Example #24
    def scrape_session_new(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
        else:
            bill_abbr = "S."

        bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
            session.split('-')[1], bill_abbr[0])
        bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
        bill_list = BeautifulSoup(self.urlopen(bill_list_url))

        bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.findNext('b').string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            act_table = info_page.findAll('blockquote')[2].table
            for row in act_table.findAll('tr')[1:]:
                action = ""
                for s in row.findAll('td')[1].findAll(text=True):
                    action += s + " "
                action = clean_action(action)

                match = re.search('Governor on (.*)$', action)
                if match:
                    act_date = parse_exec_date(match.group(1).strip())
                    actor = 'Governor'
                else:
                    if row['bgcolor'] == 'Salmon':
                        actor = 'lower'
                    else:
                        actor = 'upper'

                    if row.td.a:
                        act_date = row.td.a.string
                    else:
                        act_date = row.td.string

                    try:
                        act_date = re.search(
                            '\d{1,2}/\d{1,2}/\d{4,4}', act_date).group(0)
                    except AttributeError:
                        # No date, skip
                        continue

                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

                bill.add_action(actor, action, act_date,
                                type=action_type(action))

                vote_link = row.find('a', text='Details')
                if vote_link:
                    vote_url = vote_link.parent['href']
                    self.parse_vote_new(bill, actor, vote_url)

            sponsors = info_page.find(
                text='Sponsor(s):').parent.parent.findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            self.save_bill(bill)
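
parse_exec_date is not defined in this snippet; it presumably parses the free-form date in "Governor on ..." actions. A minimal sketch, assuming a 'Month day, year' spelling (the real format may differ):

    import datetime as dt

    def parse_exec_date(text):
        # Hypothetical: assumes text like 'April 1, 2010'
        return dt.datetime.strptime(text.strip(), '%B %d, %Y')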
Example #25
 def scrape_bill(self, chamber, session, bill_id):
     """
     Scrapes documents, actions, vote counts and votes for 
     a given bill.
     """
     session_id = self.get_session_id(session)
     url = BASE_URL + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
                                        bill_id.replace(' ', ''), session_id)
     with self.urlopen(url) as docs_for_bill:
         root = html.fromstring(docs_for_bill)
         bill_title = root.xpath(
                         '//div[@class="ContentPageTitle"]')[1].text.strip()
         b_type = utils.get_bill_type(bill_id)
         bill = Bill(session, chamber, bill_id, bill_title, type=b_type)
         bill.add_source(url)
         path = '//tr[contains(td/font/text(), "%s")]'
         link_path = '//tr[contains(td/a/@href, "%s")]'
         link_path2 = '//tr[contains(td/font/a/@href, "%s")]'
         # versions
         rows = root.xpath(path % 'd Version')
         for row in rows:
             tds = row.cssselect('td')
             bill_version = tds[1].text_content().strip()
             bill_html = tds[2].xpath('string(font/a/@href)')
             bill.add_version(bill_version, bill_html)
                                         
         #fact sheets and summary
         rows = root.xpath(link_path2 % '/summary/')
         for row in rows:
             tds = row.cssselect('td')
             fact_sheet = tds[1].text_content().strip()
             fact_sheet_url = tds[1].xpath('string(font/a/@href)')
             bill.add_document(fact_sheet, fact_sheet_url, type="summary")
                 
         #agendas
         # skipping revised, cancelled, date, time and room from agendas,
         # but how do we get the agenda type (house or senate) cleanly?
         rows = root.xpath(link_path % '/agendas')
         for row in rows:
             tds = row.cssselect('td')
             agenda_committee = tds[0].text_content().strip()
             agenda_html = tds[7].xpath('string(a/@href)').strip()
             if agenda_html == '':
                 agenda_html = tds[6].xpath('string(a/@href)').strip()
             bill.add_document(agenda_committee, agenda_html)
             
         # House Calendars
         # skipping calendar number, modified, date
         rows = root.xpath(link_path % '/calendar/h')
         for row in rows:
             tds = row.cssselect('td')
             calendar_name = tds[0].text_content().strip()
             calendar_html = tds[5].xpath('string(a/@href)')
             bill.add_document(calendar_name, calendar_html, 
                               type='house calendar')
         # Senate Calendars
         # skipping calendar number, modified, date
         rows = root.xpath(link_path % '/calendar/s')
         for row in rows:
             tds = row.cssselect('td')
             calendar_name = tds[0].text_content().strip()
             calendar_html = tds[5].xpath('string(a/@href)')
             bill.add_document(calendar_name, calendar_html, 
                               type='senate calendar')
         # amendments
         rows = root.xpath(path % 'AMENDMENT:')
         for row in rows:
             tds = row.cssselect('td')
             amendment_title = tds[1].text_content().strip()
             amendment_link = tds[2].xpath('string(font/a/@href)')
             bill.add_document(amendment_title, amendment_link, 
                               type='amendment')
         
         # videos
         # http://azleg.granicus.com/MediaPlayer.php?view_id=13&clip_id=7684
         rows = root.xpath(link_path % '&clip_id')
         for row in rows:
             tds = row.cssselect('td')
             video_title = tds[1].text_content().strip()
             video_link = tds[2].xpath('string(a/@href)')
             video_date = tds[0].text_content().strip()
             bill.add_document(video_title, video_link, date=video_date, 
                               type='video')
             
     self.scrape_actions(chamber, session, bill)    
Example #26
    def scrape_session_old(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
            chamber_name = "House"
            other_chamber = "Senate"
        else:
            bill_abbr = "S."
            chamber_name = "Senate"
            other_chamber = "House"

        start_date = '1/1/%s' % session.split('-')[0]
        data = urllib.urlencode({'Date': start_date,
                                 'Body': bill_abbr[0],
                                 'Session': session.split('-')[1]})
        bill_list_url = "http://www.leg.state.vt.us/database/"\
            "rintro/results.cfm"
        bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

        bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.parent.findAll('td')[1].string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[-1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            sponsors = info_page.find(
                text='Sponsor(s):').parent.findNext('td').findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            # Grab actions from the originating chamber
            act_table = info_page.find(
                text='%s Status:' % chamber_name).findNext('table')
            for row in act_table.findAll('tr')[3:]:
                action = clean_action(row.td.string.replace(
                        '&nbsp;', '').strip(':'))

                act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace('&nbsp;', '')
                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(chamber, action, act_date,
                                    type=action_type(action))

            # Grab actions from the other chamber
            act_table = info_page.find(
                text='%s Status:' % other_chamber).findNext('table')
            if act_table:
                if chamber == 'upper':
                    act_chamber = 'lower'
                else:
                    act_chamber = 'upper'
                for row in act_table.findAll('tr')[3:]:
                    action = clean_action(row.td.string.replace(
                            '&nbsp;', '').strip(':'))

                    act_date = row.findAll('td')[1].b.string.replace(
                        '&nbsp;', '')
                    if act_date != "":
                        detail = row.findAll('td')[2].b
                        if detail and detail.string != "":
                            action += ": %s" % detail.string.replace(
                                '&nbsp;', '')
                        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                        bill.add_action(act_chamber, action, act_date,
                                        type=action_type(action))

            self.save_bill(bill)
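clean_action and action_type are helper functions defined elsewhere in this scraper and not shown here. A minimal sketch under assumed patterns and categories (the real tables are certainly more complete):

import re

def clean_action(action):
    # Illustrative: collapse runs of whitespace and drop a trailing period.
    return re.sub(r'\s+', ' ', action).strip().rstrip('.')

def action_type(action):
    # Illustrative mapping from action text to a category string.
    if action.startswith('Read first time'):
        return 'bill:reading:1'
    if action.startswith('Signed by Governor'):
        return 'governor:signed'
    return 'other'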
Example #27
    def scrape_session(self, chamber, year):
        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = r"bill=(?:%s)\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                bill.add_sponsor('primary', sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor('cosponsor', sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor('committee', spons_str.strip())

            # Get actions
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except Exception:
                        self.log("Failed parsing vote at %s" %
                                 cols[1].a['href'])

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile('.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile(r'^get_bill_text\?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
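parse_vote is defined elsewhere on this scraper; it receives an action line such as "PASSED Y25 N13" plus a link to the roll-call page. A minimal sketch of just the count-extraction step, reusing the regex from the action loop above (everything else about the real helper is an assumption):

import re

def extract_vote_counts(action):
    # Illustrative: pull the motion and yes/no totals out of a line like
    # "PASSED Y25 N13"; the real parse_vote also fetches the roll-call
    # page to record individual legislators' votes.
    m = re.match(r'(\w+) Y(\d+) N(\d+)', action)
    if m is None:
        return None
    return m.group(1), int(m.group(2)), int(m.group(3))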
Example #28
    def scrape(self, chamber, session):
        bill_search_url = 'http://www.camaraderepresentantes.org/cr_buscar.asp'
        bill_types = {'Project': 'P',
                      'Resolution': 'R',
                      'Joint Resolution': 'RC',
                      'Concurrent Resolution': 'RK',
                      'Appointment': 'N'}
        #bodies = {'upper': 'S', 'lower': 'C'}
        bodies = {'upper': 'S'}

        bill_search_page = lxml.html.parse(bill_search_url).getroot()
        search_form = bill_search_page.forms[0]

        for body in bodies.itervalues():
            for bill_type in bill_types.itervalues():
                search_form.fields['cuerpo'] = body
                search_form.fields['tipo'] = bill_type
                search_form.fields['autor'] = 'NA'

                if year_from_session(session) == '2009':
                    search_form.fields['f2'] = '12/31/2009'
                elif year_from_session(session) == '2010':
                    search_form.fields['f1'] = '01/01/2010'

                result = lxml.html.parse(
                    lxml.html.submit_form(search_form)).getroot()
                table_elements = result.cssselect('table')
                table_elements.pop()
                bill_elements = grouper(3, table_elements)

                for actions, complete_data, bill_data in bill_elements:
                    td_elements = bill_data.cssselect('td')
                    title = td_elements[1].text_content()
                    date = td_elements[3].text_content()
                    description = td_elements[5].text_content()
                    authors = td_elements[7].text_content().split('/')

                    bill = Bill(session, chamber, title, description)

                    if len(authors) == 1:
                        bill.add_sponsor('primary', authors[0])
                    else:
                        for author in authors:
                            bill.add_sponsor('cosponsor', author)

                    td_elements = actions.cssselect('td')[4:-1]
                    action_elements = grouper(3, td_elements)

                    for date_element, action, empty in action_elements:
                        # Strip the non-breaking spaces out of the date cell
                        date_text = date_element.text_content().replace(
                            u'\xa0', u'')
                        date = dt.datetime.strptime(date_text, '%m/%d/%Y')
                        action_text = action.text_content()

                        try:
                            doc_link_part = action.iterlinks().next()[2]
                        except StopIteration:
                            pass
                        else:
                            # Vote documents are not bill versions; skip them
                            if 'voto' not in doc_link_part:
                                doc_link = doc_link_url(doc_link_part)
                                bill.add_version(action_text, doc_link)

                        bill.add_action(chamber, action_text, date)

                    self.save_bill(bill)
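grouper, year_from_session and doc_link_url are helpers defined elsewhere in this module. grouper is almost certainly the classic itertools recipe; a sketch matching the grouper(3, elements) call signature used above:

from itertools import izip_longest  # itertools.zip_longest on Python 3

def grouper(n, iterable, fillvalue=None):
    # grouper(3, 'ABCDEFG') --> ('A','B','C') ('D','E','F') ('G',None,None)
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)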
Example #29
    def scrape_bill_pages(self, session, year_abr):
        """ assemble information on a bill from a number of DBF files
        """

        #Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)


        #Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            doc_name = self._doctypes[rec['doctype']]
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        #Votes (both the Assembly "A" and Senate "S" vote files)
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipped_file = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipped_file.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                passed = None

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None,
                                          None, None, bill_id=bill_id)
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                if vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')


        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
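get_dbf, categorize_action, _bill_types, _doctypes and _version_types are defined elsewhere on this scraper class. A minimal sketch of categorize_action, with an illustrative pattern table (the actual patterns and categories are assumptions):

import re

_categorizers = (
    ('Introduced', 'bill:introduced'),
    ('Passed by the Senate', 'bill:passed'),
    ('Passed by the Assembly', 'bill:passed'),
    ('Approved', 'governor:signed'),
)

def categorize_action(action):
    # Illustrative: return the action text together with the category of
    # the first matching pattern, falling back to 'other'.
    for pattern, atype in _categorizers:
        if re.match(pattern, action):
            return action, atype
    return action, 'other'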
Example #30
    def scrape_year(self, chamber, year, session):
        # NH bill ids begin with the chamber letter: S for Senate (upper),
        # H for House (lower)
        if chamber == 'upper':
            chamber_abbr = 'S'
        elif chamber == 'lower':
            chamber_abbr = 'H'

        #set up the search parameters (sent as a GET query string below)
        values = [('txtsessionyear', year),
                  ('txttitle', ''),
                  ('txtlsrnumber', ''),
                  ('Submit1', 'Submit')]
        params = urllib.urlencode(values)
        search_url = 'http://www.gencourt.state.nh.us/bill_status/Results.aspx'

        #request page with list of all bills in year
        with self.urlopen(search_url + '?' + params) as doc:
            soup = BeautifulSoup(doc)

            #parse results
            bills = soup.find("table", {"class": "ptable"})
            trs = soup.findAll("tr")
            #go through all of the table rows with relevant data
            # rows of interest start at index 8 and repeat every 11 rows
            tr_start = 8
            tr_hop = 11

            for tr in trs[tr_start::tr_hop]:
                # strip extra white space from the name; the bill id is the
                # leading word, e.g. "HB123"
                raw_id = tr.find("big").string.strip()
                bill_id = re.search(r"^(\w*)", raw_id).group(1)

                # check that the bill is in the proper chamber
                if not bill_id.startswith(chamber_abbr):
                    continue  # wrong chamber

                # check that it is a bill and not a resolution
                if "B" not in bill_id:
                    continue  # not a bill

                # append the bill id suffix if one exists, e.g. "-FN"
                res = re.search(r"(-\w*)$", raw_id)
                if res is not None:
                    bill_id += res.group(1)

                # get bill title
                title = tr.findAll("b")[0]
                bill_title = title.nextSibling.string
                bill_title = bill_title.strip()
                bill_title = bill_title.encode('ascii', 'xmlcharrefreplace')

                # grab the url of the bill text; the Bill Docket, Bill
                # Status, Roll Calls and Audio Files links are not scraped
                # yet (the status page would be the place to pull sponsors
                # from)
                bill_url = None
                for url in tr.findAll("a"):
                    if re.search("Bill Text", str(url.string)):
                        bill_url = self.get_bill_text(url)

                bill = Bill(session, chamber, bill_id, bill_title)
                if bill_url:
                    bill.add_version("Bill text", bill_url)
                bill.add_source(search_url)
                self.save_bill(bill)
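get_bill_text is another helper defined elsewhere on this scraper. Given the "Bill Text" anchor it presumably resolves the link into an absolute URL; a hedged sketch (the base URL and the behavior are assumptions):

import urlparse  # urllib.parse on Python 3

def get_bill_text(self, link):
    # Illustrative: resolve the anchor's relative href against the
    # bill-status site; the real helper may instead follow the page and
    # return the URL of the text document itself.
    base = 'http://www.gencourt.state.nh.us/bill_status/'
    return urlparse.urljoin(base, link['href'])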