Example #1
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    if insert.find('Special') != -1:
                        session = insert
                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url)


                    primary, secondary = self.scrape_sponsors(page)
                    
                    if primary[0] == 'By:':
                        primary.pop(0)
                        
                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        full_name = ' '.join(primary)
                        bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    minutes_count = 2
                    for mr in root.xpath('//table[4]/tr/td[3]/a'):
                        minutes =  mr.xpath("string(@href)")
                        minutes_url = "http://www.leg.state.nv.us" + minutes
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                        minutes_date = mr.xpath(minutes_date_path).split()
                        minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                        bill.add_document(minutes_date, minutes_url)
                        minutes_count = minutes_count + 1


                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
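The decode-and-replace at the top of the page loop normalizes UTF-8 non-breaking spaces before parsing. The same step in isolation, on an invented byte string:

    raw = 'A.B.\xc2\xa0123'                         # UTF-8 bytes with a non-breaking space
    page = raw.decode('utf8').replace(u'\xa0', ' ')
    print page                                      # prints: A.B. 123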
Example #2
    def scrape_bill(self, session, chamber, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath("//br")[8].tail
            if not title:
                return
            title = title.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                path = version_path % version_type
                links = page.xpath(path)
                if links:
                    bill.add_version(version_type, links[0].attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
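The version loop relies on XPath's contains() against each anchor's string value. A self-contained sketch of that lookup, with invented markup:

    import lxml.html

    page = lxml.html.fromstring(
        '<p><a href="/v1">Introduced Bill (PDF)</a> <a href="/v2">Enrolled Act</a></p>')
    for version_type in ('Introduced Bill', 'Enrolled Act'):
        links = page.xpath("//a[contains(., '%s')]" % version_type)
        if links:
            print version_type, links[0].attrib['href']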
Example #3
    def parse_bill(self, chamber, session, special, link):
        bill_num = link.text.strip()
        bill_type = re.search('type=(B|R|)', link.attrib['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_num)

        url = info_url(chamber, session, special, bill_type, bill_num)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath(
                "//td[text() = 'Short Title:']/following-sibling::td")[0]
            title = title.text.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            self.parse_bill_versions(bill, page)

            self.parse_history(bill, history_url(chamber, session, special,
                                                 bill_type, bill_num))

            self.parse_votes(bill, vote_url(chamber, session, special,
                                            bill_type, bill_num))

            self.save_bill(bill)
Example #4
 def scrape_session_2009(self, chamber, session):
     url, type = bills_url(chamber)
                 
     with self.urlopen(url) as page_html:
         page = lxml.html.fromstring(page_html)
         for element, attribute, link, pos in page.iterlinks():         
             if re.search("billtype=" + type + "&billnumber=[0-9]+", link) != None:
                 bill_page_url = bill_url(link)
                 with self.urlopen(bill_page_url) as bill_page_str:
                     bill_page = lxml.html.fromstring(bill_page_str)
                     splitted_link = link.split("=")
                     bill_number = splitted_link[-1]
                     bill_id = bill_page.cssselect('a[class="headerlink"]')
                     bill_id = bill_id[0]
                     bill_id = bill_id.text_content()
                     bill_title = bill_page.cssselect('td[style="color:Black"]')
                     bill_title = bill_title[0]
                     bill_title = bill_title.text_content()
                     bill = Bill(session, chamber, bill_id, bill_title)
                     bill.add_source(bill_page_url)
                     
                     actions_table_list = bill_page.cssselect('table[rules="all"]')
                     actions_table = actions_table_list[0]
                     action_elements = actions_table.cssselect('tr')
                     # first element is not an action element
                     action_elements.pop(0)
                     
                     for ae in action_elements:
                         action_element_parts = ae.cssselect('td')
                         
                         action_date = action_element_parts[0]
                         actor_house = action_element_parts[1]
                         action_text = action_element_parts[2]
                         
                         # look for acting committees
                         match = re.search("(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})", action_text.text_content())
 
                         if match is not None:
                             actor = match.group(0)
                         elif actor_house.text_content().strip() == 'H':
                             actor = "lower"
                         elif actor_house.text_content().strip() == 'S':
                             actor = "upper"
                         else:
                             actor = chamber
                         
                         action_date = dt.datetime.strptime(action_date.text_content(), '%m/%d/%Y') 
                             
                         if (re.search("The votes were as follows", action_text.text_content()) != None):
                             self.scrape_votes(action_text.text_content(), bill_page_url, actor_house, action_date, bill)
                                                        
                         bill.add_action(actor, action_text.text_content(), action_date)
                     
                     with self.urlopen(versions_page_url(type, bill_number)) as versions_page_html:
                         versions_page = lxml.html.fromstring(versions_page_html)
                         versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                         for ve in versions_elements:
                             element_text = ve.text_content()
                             # rstrip() strips a character set, not a suffix
                             version_name = re.sub(r'_?\.HTM$', '', element_text)
                             bill.add_version(version_name, bill_version_url(element_text))
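Example #4 drives its extraction with lxml's cssselect() rather than XPath (recent lxml versions need the separate cssselect package for this). The two selectors it starts from, on an invented fragment:

    import lxml.html

    html = ('<div><a class="headerlink">HB 100</a>'
            '<table><tr><td style="color:Black">Relating to education.</td></tr></table></div>')
    page = lxml.html.fromstring(html)
    print page.cssselect('a[class="headerlink"]')[0].text_content()    # HB 100
    print page.cssselect('td[style="color:Black"]')[0].text_content()  # Relating to education.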
Example #5
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'upper':
            other_chamber = 'lower'
            bill_id = 'SB 1'
        else:
            other_chamber = 'upper'
            bill_id = 'HB 1'

        b1 = Bill(session, chamber, bill_id, 'A super bill')
        b1.add_source('http://example.com/')
        b1.add_version('As Introduced', 'http://example.com/SB1.html')
        b1.add_document('Google', 'http://google.com')
        b1.add_sponsor('primary', 'Bob Smith')
        b1.add_sponsor('secondary', 'Johnson, Sally')

        d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
        v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
        v1.yes('Smith')
        v1.yes('Johnson')

        d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
        v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
        v2.no('Bob Smith')
        v2.other('S. Johnson')

        b1.add_vote(v1)
        b1.add_vote(v2)

        b1.add_action(chamber, 'introduced', d1)
        b1.add_action(chamber, 'read first time', d2)
        b1.add_action(other_chamber, 'introduced', d2)

        self.save_bill(b1)
Example #6
    def scrape_bill(self, chamber, session, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
            title, primary_sponsor = header.split(' -- ')

            if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
                bill_type = ['bill']
            elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
                bill_type = ['concurrent resolution']
            elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
                bill_type = ['joint resolution']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_link.attrib['href'])

            for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

                name = link.getprevious().tail.strip()
                bill.add_version(name, link.attrib['href'])

            self.save_bill(bill)
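Each version name lives in the tail text of the node just before its 'HTML' link, hence the getprevious()/.tail pair. In isolation, with hypothetical markup:

    import lxml.html

    frag = lxml.html.fromstring('<p><br/>Introduced <a href="/v1">HTML</a></p>')
    link = frag.xpath('//a')[0]
    print link.getprevious().tail.strip()   # prints: Introduced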
Example #7
 def scrape(self, chamber, session):   
     year = year_from_session(session)
     url = bills_url(year)
     with self.urlopen(url) as bills_page_html:
         bills_page = lxml.html.fromstring(bills_page_html)
         table_rows = bills_page.cssselect('tr')
         # Eliminate empty rows
         table_rows = table_rows[0:len(table_rows):2]
         for row in table_rows:
             row_elements = row.cssselect('td')
             
             bill_document = row_elements[0]
             bill_document.make_links_absolute(base_url())
             
             element, attribute, link, pos = bill_document.iterlinks().next()
             bill_id = element.text_content()
             # rstrip('.pdf') would strip a character set; drop the suffix instead
             if bill_id.endswith('.pdf'):
                 bill_id = bill_id[:-4]
             bill_document_link = link           
             
             title_and_sponsors = row_elements[1]
             title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]', title_and_sponsors.text_content())
             sponsors_match = re.search('[a-z]([A-Z]+.+)', title_and_sponsors.text_content())
             title = title_match.group(1)
             sponsors =  sponsors_match.group(1)
             separated_sponsors = sponsors.split('--')
             
             bill = Bill(session, chamber, bill_id, title)
             bill.add_version('current', bill_document_link)
             
             if separated_sponsors[1] == '(NONE)':
                 bill.add_sponsor('primary', separated_sponsors[0])
             
             else:
                 bill.add_sponsor('cosponsor', separated_sponsors[0])
                 bill.add_sponsor('cosponsor', separated_sponsors[1])                
            
             
             versions_page_element = row_elements[2]
             versions_page_element.make_links_absolute(base_url())
             element, attribute, link, pos = versions_page_element.iterlinks().next()
             
             bill.add_source(link)
             
             self.scrape_versions(link, bill)
                   
             actions_page_element = row_elements[3]
             element, attribute, link, pos = actions_page_element.iterlinks().next()
             frame_link = base_url() + link.split('?Open&target=')[1]
             
             self.scrape_actions(frame_link, bill)
             
             votes_page_element = row_elements[7]
             element, attribute, link, pos = votes_page_element.iterlinks().next()
             frame_link = base_url() + link.split('?Open&target=')[1]
             self.scrape_votes(frame_link, chamber, bill)
Example #8
    def scrape_bill(self, chamber, session, bill_type, number):
        """ Creates a bill object
        """
        if len(session) == 4:
            session_url = session+'rs'
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # title
            # find <a name="Title">, get parent dt, get parent dl, then get dd within dl
            title = doc.cssselect('a[name=Title]')[0] \
                .getparent().getparent().cssselect('dd')[0].text.strip()

            # create the bill object now that we have the title
            print "%s %d %s" % (bill_type, number, title)
            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)     # sponsors
            self.parse_bill_actions(doc, bill)      # actions
            self.parse_bill_documents(doc, bill)    # documents and versions
            self.parse_bill_votes(doc, bill)        # votes

            # add bill to collection
            self.save_bill(bill)
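The title lookup climbs from the <a name="Title"> anchor to its enclosing <dl> and reads the first <dd>, exactly as the comment describes. The same navigation on a stripped-down fragment:

    import lxml.html

    doc = lxml.html.fromstring(
        '<dl><dt><a name="Title"></a>Title</dt><dd> Education - Funding </dd></dl>')
    title = doc.cssselect('a[name=Title]')[0] \
        .getparent().getparent().cssselect('dd')[0].text.strip()
    print title   # prints: Education - Funding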
Example #9
    def parse_senate_billpage(self, bill_url, year):
        with self.urlopen(bill_url) as bill_page:
            bill_page = BeautifulSoup(bill_page)
            # get all the info needed to record the bill
            bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
            bill_title = bill_page.find(id="lblBillTitle").font.string
            bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
            bill_lr = bill_page.find(id="lblLRNum").font.string

            bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                        bill_lr=bill_lr, official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor
            bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
            bill_sponsor_link = bill_page.find(id="hlSponsor").get('href')
            bill.add_sponsor('primary', bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist
            cosponsor_tag = bill_page.find(id="hlCoSponsors")
            if cosponsor_tag and cosponsor_tag.get('href'):
                self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

            # get the actions
            action_url = bill_page.find(id="hlAllActions")['href']
            self.parse_senate_actions(bill, action_url)

            # stored on a separate page
            versions_url = bill_page.find(id="hlFullBillText")
            if versions_url:
                self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
Example #10
    def scrape_bill(self, bill_url, chamber, session):
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            match = re.match(r"^([^:]+): ([^(]+)", summary)

            if match:
                subjects = [match.group(1).strip()]
                title = match.group(2).strip()
            else:
                raise ScrapeError("Bad title")

            bill = Bill(session, chamber, bill_id, title,
                        subjects=subjects)
            bill.add_source(bill_url)

            history_link = page.xpath("//a[text() = 'History']")[0]
            history_url = history_link.attrib['href']
            self.scrape_history(bill, history_url)

            authors_link = page.xpath("//a[text() = 'Authors']")[0]
            authors_url = authors_link.attrib['href']
            self.scrape_authors(bill, authors_url)

            try:
                versions_link = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                versions_url = versions_link.attrib['href']
                self.scrape_versions(bill, versions_url)
            except IndexError:
                # Only current version
                try:
                    version_link = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    version_url = version_link.attrib['href']
                    bill.add_version("%s Current" % bill_id, version_url)
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            try:
                votes_link = page.xpath("//a[text() = 'Votes']")[0]
                self.scrape_votes(bill, votes_link.attrib['href'])
            except IndexError:
                # Some bills don't have any votes
                pass

            self.save_bill(bill)

            return True
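Every optional link is fetched with xpath(...)[0] wrapped in try/except IndexError. A small helper (hypothetical, not part of the scraper) expresses the same "first match or None" idea without exceptions:

    def first(page, path):
        """Return the first node matching the XPath expression, or None."""
        results = page.xpath(path)
        return results[0] if results else None

    # e.g. versions_link = first(page, "//a[text() = 'Text - All Versions']")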
Example #11
 def parse_special_session_bill_status_page(self, bill_id, status_page, bill_table, session, chamber, sources):
     title = bill_table.xpath('//tr[3]/td[2]')[0].text_content()
     bill = Bill(session, chamber, bill_id, title)
     for source in sources:
         bill.add_source(source)
     self.add_sponsors(bill, self.get_sponsor_table(status_page))
     self.add_actions(bill, self.get_action_table(status_page))
     return bill
Example #12
    def scrape(self, chamber, year):
        # Data prior to 1997 is contained in pdfs
        if year < "1997":
            raise NoDataForYear(year)

        bills_url = "http://www.leg.state.co.us/CLICS/CLICS" + year + "A/csl.nsf/%28bf-1%29?OpenView&Count=2000"
        with self.lxml_context(bills_url) as bills_page:
            table_rows = bills_page.cssselect("tr")
            # Eliminate empty rows
            table_rows = table_rows[0 : len(table_rows) : 2]
            for row in table_rows:
                print "row"
                row_elements = row.cssselect("td")

                bill_document = row_elements[0]
                bill_document.make_links_absolute("http://www.leg.state.co.us")

                element, attribute, link, pos = bill_document.iterlinks().next()
                bill_id = element.text_content().rstrip(".pdf")
                bill_document_link = link

                title_and_sponsors = row_elements[1]
                title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]", title_and_sponsors.text_content())
                sponsors_match = re.search("[a-z]([A-Z]+.+)", title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split("--")

                bill = Bill(year, chamber, bill_id, title)
                bill.add_version("current", bill_document_link)

                if separated_sponsors[1] == "(NONE)":
                    bill.add_sponsor("primary", separated_sponsors[0])

                else:
                    bill.add_sponsor("cosponsor", separated_sponsors[0])
                    bill.add_sponsor("cosponsor", separated_sponsors[1])

                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute("http://www.leg.state.co.us")
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                self.scrape_actions(frame_link, bill)

                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                self.scrape_votes(frame_link, chamber, bill)
Example #13
    def scrape_year(self, year, chamber):    
        
        sep = '<h1>House</h1>'
            
        if chamber == 'upper':
            after = False
            reg = '[5-9]'
        else:
            after = True
            reg = '[1-4]'
                
        with self.lxml_context("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year), sep, after) as page:
            for element, attribute, link, pos in page.iterlinks():
                if re.search("bill=" + reg + "[0-9]{3}", link) != None:
                    bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                    with self.lxml_context(bill_page_url) as bill_page:
                        raw_title = bill_page.cssselect('title')
                        split_title = raw_title[0].text_content().split(' ')
                        bill_id = split_title[0] + ' ' + split_title[1]
                        bill_id = bill_id.strip()
                        session = split_title[3].strip()

                        title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                        title = title_element.text_content()

                        bill = Bill(session, chamber, bill_id, title)
                        bill.add_source(bill_page_url)
                
                        self.scrape_actions(bill_page, bill)
                
                        for element, attribute, link, pos in bill_page.iterlinks():
                            if re.search("billdocs", link) != None:
                                if re.search("Amendments", link) != None:
                                    bill.add_document("Amendment: " + element.text_content(), link)    
                                elif re.search("Bills", link) != None:
                                    bill.add_version(element.text_content(), link) 
                                else:
                                    bill.add_document(element.text_content(), link)
                            elif re.search("senators|representatives", link) != None:
                                with self.lxml_context(link) as senator_page:
                                    try:
                                        name_tuple = self.scrape_legislator_name(senator_page)
                                        bill.add_sponsor('primary', name_tuple[0])
                                    except:
                                        pass
                            elif re.search("ShowRollCall", link) != None:
                                match = re.search("([0-9]+,[0-9]+)", link)
                                match = match.group(0)
                                match = match.split(',')
                                id1 = match[0]
                                id2 = match[1]
                                url = "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" +id2
                                with self.lxml_context(url) as vote_page:
                                    self.scrape_votes(vote_page, bill, url)
                                    
                        self.save_bill(bill)
Example #14
    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == "lower":
            bill_abbr = "HB"
        else:
            bill_abbr = "SB"

        bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
            session.replace(' ', ''))
        self.log("Getting bill list for %s, %s" % (session, chamber))

        try:
            base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
        except:
            # this session doesn't exist for this year
            return

        bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)

        for link in base_bill_list.findAll('a', href=bill_list_link_re):
            bill_list = self.soup_parser(self.urlopen(link['href']))
            bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)

            for bill_link in bill_list.findAll('a', href=bill_link_re):
                bill_id = bill_link.find(text=True).strip()

                bill_info_url = bill_link['href']
                bill_info = self.soup_parser(self.urlopen(bill_info_url))

                bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                    '&nbsp;', ' ').strip().split(' -- ')

                bill = Bill(session, chamber, bill_id, bill_title)
                bill.add_source(bill_info_url)
                bill.add_sponsor('primary', primary_sponsor)

                status_re = re.compile('.*billsta/%s.*.htm' %
                                       bill_abbr.lower())
                status_link = bill_info.find('a', href=status_re)

                if status_link:
                    self.parse_status(bill, status_link['href'])

                text_find = bill_info.find(
                    text="Bill Text (If you are having trouble viewing")

                if text_find:
                    text_link_re = re.compile('.*\.htm')
                    for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                        version_name = text_link.previous.strip()
                        bill.add_version(version_name, text_link['href'])

                self.save_bill(bill)
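Link filtering here goes through compiled href regexes passed to findAll(). A minimal standalone version, assuming BeautifulSoup 3 (the API these examples use):

    import re
    from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3

    soup = BeautifulSoup('<a href="billhtm/HB0001.htm">HB 1</a>'
                         '<a href="other/page.htm">skip me</a>')
    for link in soup.findAll('a', href=re.compile(r'billhtm/HB.*\.htm')):
        print link['href']                    # prints: billhtm/HB0001.htm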
Example #15
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # the page omits the year; infer it from the session code
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
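The action dates on these pages carry no year, so strptime('%b %d') parses them with the default year 1900 and the scraper patches the real year in from the session code. The trick in isolation:

    import datetime as dt

    action_date = dt.datetime.strptime('Feb 25', '%b %d')   # year defaults to 1900
    action_date = action_date.replace(year=2009)            # year taken from the session
    print action_date.strftime('%Y-%m-%d')                  # prints: 2009-02-25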
Example #16
    def parse_standard_bill_status_page(self, bill_id, status_page, session, chamber, sources):
        try:
            title = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")[0].text_content()
        except IndexError:
            if len(status_page.xpath("/html/html")) == 2:
                title = status_page.xpath('/html/html[2]/tr[1]/td[2]')[0].text_content()
            else:
                title = status_page.xpath('/html/html[3]/tr[1]/td[2]')[0].text_content()

        bill = Bill(session, chamber, bill_id, title)
        for source in sources:
            bill.add_source(source)
        self.add_sponsors(bill, self.get_sponsor_table(status_page))
        self.add_actions(bill, self.get_action_table(status_page))

        return bill
Example #17
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            bill_type = {'F': 'bill', 'R':'resolution',
                         'C': 'concurrent resolution'}[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill.add_source(bill_detail_url)

            # grab sponsors
            sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip())

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'])

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url)

        self.save_bill(bill)
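Relative version links ('/bin/getbill.php?...') are resolved against VERSION_URL_BASE with urlparse.urljoin. In isolation (the base URL below is a hypothetical stand-in):

    import urlparse

    VERSION_URL_BASE = 'https://www.revisor.mn.gov/'   # hypothetical value
    print urlparse.urljoin(VERSION_URL_BASE, '/bin/getbill.php?number=HF1')
    # prints: https://www.revisor.mn.gov/bin/getbill.php?number=HF1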
Example #18
    def scrape(self, chamber, session):
        # internal id for the session, store on self so all methods have access
        self.site_id = self.metadata['session_details'][session]['site_id']

        self.build_subject_map()

        # used for skipping bills from opposite chamber
        start_letter = 'H' if chamber == 'lower' else 'S'

        url = 'http://leg6.state.va.us/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

        while url:
            with self.urlopen(url) as html:
                doc = lxml.html.fromstring(html)

                url = None  # no more unless we encounter 'More...'

                bills = doc.xpath('//ul[@class="linkSect"]/li')
                for bill in bills:
                    link = bill.getchildren()[0]
                    bill_id = link.text_content()

                    # check if this is the 'More...' link
                    if bill_id == 'More...':
                        url = BASE_URL + link.get('href')

                    # skip bills from the other chamber
                    elif not bill_id.startswith(start_letter):
                        continue

                    else:
                        # create a bill
                        desc = bill.xpath('text()')[0].strip()
                        bill_type = {'B': 'bill',
                                     'J': 'joint resolution',
                                     'R': 'resolution'}[bill_id[1]]
                        bill = Bill(session, chamber, bill_id, desc,
                                    type=bill_type)

                        bill_url = BASE_URL + link.get('href')
                        self.fetch_sponsors(bill)
                        self.scrape_bill_details(bill_url, bill)
                        bill['subjects'] = self.subject_map[bill_id]
                        bill.add_source(bill_url)
                        self.save_bill(bill)
Example #19
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))
        # see 2007 HB 2... weird.
        try:
            bill_id = status_page.xpath("/div/form[1]/table[2]/tr[2]/td[2]")[0].text_content()
        except IndexError:
            bill_id = status_page.xpath("/html/html[2]/tr[1]/td[2]")[0].text_content()

        try:
            title = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")[0].text_content()
        except IndexError:
            title = status_page.xpath("/html/html[3]/tr[1]/td[2]")[0].text_content()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_url)

        self.add_sponsors(bill, status_page)
        self.add_actions(bill, status_page)

        return bill
Example #20
    def scrape_bill(self, chamber, session, bill_type, number):
        """ Creates a bill object
        """
        if len(session) == 4:
            session_url = session+'rs'
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # find <a name="Title">, get parent dt, get parent dl, then dd n dl
            title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

            synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

            #print "%s %d %s" % (bill_type, number, title)

            if 'B' in bill_type:
                _type = ['bill']
            elif 'J' in bill_type:
                _type = ['joint resolution']

            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                        type=_type, synopsis=synopsis)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)     # sponsors
            self.parse_bill_actions(doc, bill)      # actions
            self.parse_bill_documents(doc, bill)    # documents and versions
            self.parse_bill_votes(doc, bill)        # votes

            # subjects
            subjects = []
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
                subjects.append(subj.text.split('-see also-')[0])
            bill['subjects'] = subjects

            # add bill to collection
            self.save_bill(bill)
Example #21
    def scrape_assem_bills(self, chamber, insert, session):
        
        doc_type = [1, 3, 5, 6]
        for doc in doc_type:
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, doc)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
                    bill = Bill(session, chamber, bill_id, title)

                    primary, secondary = self.scrape_sponsors(page_path)
                    
                    if primary[0] == 'By:':
                        primary.pop(0)
                        
                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        full_name = ' '.join(primary)
                        bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    self.scrape_actions(page_path, bill, "Assembly")
                    self.scrape_votes(page_path, bill, "Assembly", insert, title)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Example #22
    def scrape_bill(self, chamber, session, bill_type, number):
        """ Creates a bill object
        """
        if len(session) == 4:
            session_url = session + "rs"
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # find <a name="Title">, get parent dt, get parent dl, then dd n dl
            title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

            # create the bill object now that we have the title
            print "%s %d %s" % (bill_type, number, title)

            if "B" in bill_type:
                _type = ["bill"]
            elif "J" in bill_type:
                _type = ["joint resolution"]

            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title, type=_type)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)  # sponsors
            self.parse_bill_actions(doc, bill)  # actions
            self.parse_bill_documents(doc, bill)  # documents and versions
            self.parse_bill_votes(doc, bill)  # votes

            # subjects
            subjects = []
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
                subjects.append(subj.text.split("-see also-")[0])
            bill["subjects"] = subjects

            # add bill to collection
            self.save_bill(bill)
Example #23
    def parse_bill(self, chamber, session, special, link):
        bill_number = link.contents[0]
        type = re.search('type=(B|R|)', link['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), type, bill_number)

        bill_info_url = info_url(chamber, session, special, type, bill_number)

        with self.urlopen(bill_info_url) as info_page:
            info_page = BeautifulSoup(info_page)
            title_label = info_page.find(text='Short Title:')
            title = title_label.findNext().contents[0]

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(bill_info_url)

            self.parse_bill_versions(bill, info_page)

            self.parse_history(bill, history_url(chamber, session, special,
                                                 type, bill_number))

            self.parse_votes(bill, vote_url(chamber, session, special,
                                            type, bill_number))

            self.save_bill(bill)
Example #24
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            title = page.xpath("//span[@id='lblAbstract']")[0].text
            
            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)
            
            # Primary Sponsor
            sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
            sponsor = sponsor.replace('*','').strip()
            bill.add_sponsor('primary',sponsor)
            
            # Co-sponsors unavailable for scraping (loaded into page via AJAX)
            
            # Full summary doc
            summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary.get('href'))
            
            # Actions
            tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
            actions_table = tables[0]
            action_rows = actions_table.xpath("tr[position()>1]")
            for ar in action_rows:
                action_taken = ar.xpath("td")[0].text
                action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
                bill.add_action(chamber, action_taken, action_date)

            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if votes_link:
                votes_link = votes_link[0].get('href')
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

            self.save_bill(bill)
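The action table is walked with tr[position()>1] to skip the header row. The same predicate on a toy table:

    import lxml.html

    table = lxml.html.fromstring(
        '<table><tr><th>Action</th><th>Date</th></tr>'
        '<tr><td>Passed House</td><td>01/29/2010</td></tr></table>')
    for row in table.xpath('tr[position()>1]'):
        print row.xpath('td')[0].text   # prints: Passed House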
Example #25
    def scrape_session_new(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
        else:
            bill_abbr = "S."

        bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
            session.split('-')[1], bill_abbr[0])
        bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
        bill_list = BeautifulSoup(self.urlopen(bill_list_url))

        bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.findNext('b').string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            act_table = info_page.findAll('blockquote')[2].table
            for row in act_table.findAll('tr')[1:]:
                action = ""
                for s in row.findAll('td')[1].findAll(text=True):
                    action += s + " "
                action = clean_action(action)

                match = re.search('Governor on (.*)$', action)
                if match:
                    act_date = parse_exec_date(match.group(1).strip())
                    actor = 'Governor'
                else:
                    if row['bgcolor'] == 'Salmon':
                        actor = 'lower'
                    else:
                        actor = 'upper'

                    if row.td.a:
                        act_date = row.td.a.string
                    else:
                        act_date = row.td.string

                    try:
                        act_date = re.search(
                            r'\d{1,2}/\d{1,2}/\d{4}', act_date).group(0)
                    except AttributeError:
                        # No date, skip
                        continue

                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

                bill.add_action(actor, action, act_date,
                                type=action_type(action))

                vote_link = row.find('a', text='Details')
                if vote_link:
                    vote_url = vote_link.parent['href']
                    self.parse_vote_new(bill, actor, vote_url)

            sponsors = info_page.find(
                text='Sponsor(s):').parent.parent.findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            self.save_bill(bill)
Example #26
    def scrape_session_old(self, chamber, session):
        if chamber == "lower":
            bill_abbr = "H."
            chamber_name = "House"
            other_chamber = "Senate"
        else:
            bill_abbr = "S."
            chamber_name = "Senate"
            other_chamber = "House"

        start_date = '1/1/%s' % session.split('-')[0]
        data = urllib.urlencode({'Date': start_date,
                                 'Body': bill_abbr[0],
                                 'Session': session.split('-')[1]})
        bill_list_url = "http://www.leg.state.vt.us/database/"\
            "rintro/results.cfm"
        bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

        bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.parent.findAll('td')[1].string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            text_links = info_page.findAll('blockquote')[-1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            sponsors = info_page.find(
                text='Sponsor(s):').parent.findNext('td').findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            # Grab actions from the originating chamber
            act_table = info_page.find(
                text='%s Status:' % chamber_name).findNext('table')
            for row in act_table.findAll('tr')[3:]:
                action = clean_action(row.td.string.replace(
                        '&nbsp;', '').strip(':'))

                act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace('&nbsp;', '')
                    bill.add_action(chamber, action, act_date,
                                    type=action_type(action))

            # Grab actions from the other chamber
            act_table = info_page.find(
                text='%s Status:' % other_chamber).findNext('table')
            if act_table:
                if chamber == 'upper':
                    act_chamber = 'lower'
                else:
                    act_chamber = 'upper'
                for row in act_table.findAll('tr')[3:]:
                    action = clean_action(row.td.string.replace(
                            '&nbsp;', '').strip(':'))

                    act_date = row.findAll('td')[1].b.string.replace(
                        '&nbsp;', '')
                    if act_date != "":
                        detail = row.findAll('td')[2].b
                        if detail and detail.string != "":
                            action += ": %s" % detail.string.replace(
                                '&nbsp;', '')
                        date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                        bill.add_action(act_chamber, action, date,
                                        type=action_type(action))

            self.save_bill(bill)
Example #27
    def scrape_session(self, chamber, year):
        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Sessions last 2 years, 1993-1994 was the 18th
        session = str(18 + ((int(year) - 1993) / 2))
        year2 = str(int(year) + 1)

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page and strip malformed tags
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                bill.add_sponsor('primary', sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor('cosponsor', sponsor.strip())
            else:
                # Committee sponsorship
                bill.add_sponsor('committee', spons_str.strip())

            # Get actions
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except:
                        self.log("Failed parsing vote at %s" %
                                 cols[1].a['href'])

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile('.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile('^get_bill_text?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
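Roll-call actions are recognized by the "Y<count> N<count>" tally embedded in the action text. The regex in isolation, on an invented action line:

    import re

    match = re.match(r"\w+ Y(\d+) N(\d+)", "Passed Y25 N14")
    if match:
        print match.group(1), match.group(2)   # prints: 25 14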
Example #28
    def scrape_bill_pages(self, session, year_abr):
        """ assemble information on a bill from a number of DBF files
        """

        #Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)


        #Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            doc_name = self._doctypes[rec['doctype']]
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes (both Assembly 'A' and Senate 'S' vote files)
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            # only the first-year files exist for the 2010 session
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipped_file = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipped_file.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                passed = None

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None,
                                          None, None, bill_id=bill_id)
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # count yes/no/other votes and record the overall result
            for vote in votes.itervalues():
                vote["yes_count"] = len(vote["yes_votes"])
                vote["no_count"] = len(vote["no_votes"])
                vote["other_count"] = len(vote["other_votes"])
                # simple majority; quorum/supermajority rules are not modeled
                vote["passed"] = vote["yes_count"] > vote["no_count"]
                bill = bill_dict[vote["bill_id"]]
                bill.add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
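
`get_dbf` is not shown in this example. Below is a minimal sketch of what the method presumably does, fetch a zipped DBF file and return its source URL plus dict-like records, here using the `dbfread` package; the FTP URL layout is an assumption, and only the return shape matters to `scrape_bill_pages`:

    import zipfile
    from dbfread import DBF

    def get_dbf(self, year_abr, dbf_name):
        """Fetch <dbf_name>.DBF for a session; return (source_url, records)."""
        # hypothetical URL layout; the real path on the NJ FTP site may differ
        url = 'ftp://www.njleg.state.nj.us/ag/%sdata/%s.zip' % (year_abr, dbf_name)
        zip_path, resp = self.urlretrieve(url)
        with zipfile.ZipFile(zip_path) as zf:
            # assumes the archive holds a single <dbf_name>.DBF member
            dbf_path = zf.extract('%s.DBF' % dbf_name)
        # lowernames=True gives the rec["billtype"]-style keys used above
        return url, DBF(dbf_path, lowernames=True)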
Example no. 29
    def scrape_year(self, chamber, year, session):
        # NH Senate bills are prefixed 'S', House bills 'H'
        if chamber == 'upper':
            chamber_abbr = 'S'
        elif chamber == 'lower':
            chamber_abbr = 'H'

        # set up search parameters (sent below as a GET query string)
        values = [('txtsessionyear', year),
                  ('txttitle', ''),
                  ('txtlsrnumber', ''),
                  ('Submit1', 'Submit')]
        params = urllib.urlencode(values)
        search_url = 'http://www.gencourt.state.nh.us/bill_status/Results.aspx'

        # request the page listing all bills for the year
        with self.urlopen(search_url + '?' + params) as doc:
            soup = BeautifulSoup(doc)

            # parse results by walking the rows positionally: bill entries
            # start at row 8 and repeat every 11 rows
            trs = soup.findAll("tr")
            tr_start = 8
            tr_hop = 11
            i = 0

            while (tr_start + (tr_hop * i)) < len(trs):
                tr = trs[tr_start + (tr_hop * i)]
                i = i + 1
                # strip whitespace from the id and keep its leading word
                raw_id = tr.find("big").string.strip()
                bill_id = re.search(r"^(\w*)", raw_id).group(1)

                # skip bills from the other chamber
                if not bill_id.startswith(chamber_abbr):
                    continue

                # keep only bills; resolutions lack a 'B' in the id
                if "B" not in bill_id:
                    continue

                # re-attach the bill id suffix if one exists (e.g. '-FN')
                m = re.search(r"(-\w*)$", raw_id)
                if m is not None:
                    bill_id += m.group(1)

                # get bill title
                title = tr.findAll("b")[0]
                bill_title = title.nextSibling.string
                bill_title = bill_title.strip()
                bill_title = bill_title.encode('ascii', 'xmlcharrefreplace')

                # grab the bill text URL from the row's links; Bill Docket,
                # Roll Calls and Audio Files links are ignored
                bill_url = None
                for url in tr.findAll("a"):
                    label = str(url.string)
                    if "Bill Text" in label:
                        bill_url = self.get_bill_text(url)
                    elif "Bill Status" in label:
                        # assumed to be a helper method on this scraper
                        self.add_bill_sponsors()

                bill = Bill(session, chamber, bill_id, bill_title)
                if bill_url:
                    bill.add_version("Bill text", bill_url)
                bill.add_source(search_url)
                self.save_bill(bill)
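
The id handling above, prefix extraction plus optional suffix re-attachment, can be exercised in isolation; the sample ids below are hypothetical but follow NH's HB/SB numbering:

    import re

    def split_bill_id(raw_id):
        """Split 'HB1234-FN' into ('HB1234', '-FN'); the suffix may be empty."""
        base = re.search(r'^(\w*)', raw_id).group(1)
        m = re.search(r'(-\w*)$', raw_id)
        return base, (m.group(1) if m else '')

    assert split_bill_id('HB1234-FN') == ('HB1234', '-FN')
    assert split_bill_id('SB12') == ('SB12', '')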
Example no. 30
    def scrape_bill(self, chamber, session, url):
        url = url + "&Year=%s" % session
        with self.urlopen(url) as page:
            page = page.replace('&nbsp;', ' ').replace('<br>', '\n')
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath('//h3')[0].text.strip()
            title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

            bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
            bill_id = bill_id.split()[0].strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            hist = page.xpath("string(//pre[@class='billhistory'])").strip()
            act_re = re.compile(r'^  (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                                r'(.*\n(\s{16}.*\n)*)',
                                re.MULTILINE)

            # Actions
            for match in act_re.finditer(hist):
                action = match.group(3).replace('\n', ' ')
                action = re.sub(r'\s+', ' ', action).strip()

                if match.group(2) == 'SENATE':
                    actor = 'upper'
                else:
                    actor = 'lower'

                date = match.group(1)
                date = datetime.datetime.strptime(date, "%m/%d/%y")

                for act_text in re.split(' -[HS]J \d+;? ?', action):
                    act_text = act_text.strip()
                    if not act_text:
                        continue

                    types = []
                    act_lower = act_text.lower()
                    if act_lower.startswith('introduced'):
                        types.append('bill:introduced')
                    if 'referred to' in act_lower:
                        types.append('committee:referred')
                    if 'died in committee' in act_lower:
                        types.append('committee:failed')
                    if 'favorable by' in act_lower:
                        types.append('committee:passed:favorable')
                    if 'amendment(s) adopted' in act_lower:
                        types.append('amendment:passed')

                    bill.add_action(actor, act_text, date, type=types)

            # Sponsors
            primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                        hist).group(1).strip('; ')
            bill.add_sponsor('primary', primary_sponsor)

            cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                                   r'([\w .]+(;[\w .\n]+)*)',
                                   re.MULTILINE)
            match = cospon_re.search(hist)

            if match:
                for cosponsor in match.group(2).split(';'):
                    cosponsor = cosponsor.replace('\n', '').strip()
                    bill.add_sponsor('cosponsor', cosponsor)

            # Versions
            for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
                version = link.xpath('string(../../td[1])').strip()

                bill.add_version(version, link.attrib['href'])

            # House Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
                bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

            # Senate Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
                bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

            self.save_bill(bill)
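
The chain of substring checks that assigns action types lends itself to a table-driven form; here is a sketch with the same patterns, offered as a refactoring suggestion rather than as part of the original scraper:

    _SUBSTRING_TYPES = [
        ('referred to', 'committee:referred'),
        ('died in committee', 'committee:failed'),
        ('favorable by', 'committee:passed:favorable'),
        ('amendment(s) adopted', 'amendment:passed'),
    ]

    def classify_action(act_text):
        """Map an action string to the same type tags as the loop above."""
        act_lower = act_text.lower()
        types = []
        if act_lower.startswith('introduced'):
            types.append('bill:introduced')
        for phrase, atype in _SUBSTRING_TYPES:
            if phrase in act_lower:
                types.append(atype)
        return types

    assert classify_action('Introduced') == ['bill:introduced']
    assert classify_action('Referred to Education') == ['committee:referred']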