Example #1
0
 def scrape_bill(self, chamber, session, url):
     page = self.urlopen(url)
     root = lxml.html.fromstring(page)
     bill_detail_el = root.xpath('//div[@class="col2"]//div[@class="Columns bg2717"]//div[@class="widgetContent"]')[0]
     title = bill_detail_el.xpath('.//p/text()')[0]
     bill_id = bill_detail_el.xpath('./p/b/text()')[1].strip()
     m = re.search('Bill Number: ([HSD0-9]+)', bill_id)
     if len(m.groups()):
         bill_id = m.groups()[0]
     else:
         bill_id = None
     doctype = None
     if self._last_doctype:
         doctype = self._last_doctype.lower()
     bill = Bill(session, chamber, bill_id, title, type=doctype)
     sponsors_el = bill_detail_el.xpath('./p[2]/a/text()')
     for i in range(len(sponsors_el)):
         sponsor = sponsors_el[i]
         if i == 0:
             type = 'primary'
         else:
             type = 'cosponsor'
         bill.add_sponsor(type, sponsor)
     secondary_sponsors_el = bill_detail_el.xpath('.//div[@class="dataBlock"]//td/a/text()')
     for secondary_sponsor in secondary_sponsors_el:
         bill.add_sponsor('secondary', secondary_sponsor)
     print bill
     self.save_bill(bill)
Example #2
0
    def scrape_bill(self, chamber, session, bill_type, number):
        """ Creates a bill object
        """
        if len(session) == 4:
            session_url = session+'rs'
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # title
            # find <a name="Title">, get parent dt, get parent dl, then get dd within dl
            title = doc.cssselect('a[name=Title]')[0] \
                .getparent().getparent().cssselect('dd')[0].text.strip()

            # create the bill object now that we have the title
            print "%s %d %s" % (bill_type, number, title)
            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)     # sponsors
            self.parse_bill_actions(doc, bill)      # actions
            self.parse_bill_documents(doc, bill)    # documents and versions
            self.parse_bill_votes(doc, bill)        # votes

            # add bill to collection
            self.save_bill(bill)
Example #3
0
    def parse_bill(self, chamber, session, special, link):
        """Build and save a Bill from a search-results link element."""
        bill_num = link.text.strip()
        bill_type = re.search('type=(B|R|)', link.attrib['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_num)

        url = info_url(chamber, session, special, bill_type, bill_num)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # the short title sits in the cell that follows its label
            title_cell = page.xpath(
                "//td[text() = 'Short Title:']/following-sibling::td")[0]
            bill = Bill(session, chamber, bill_id, title_cell.text.strip())
            bill.add_source(url)

            self.parse_bill_versions(bill, page)

            # history and votes each live at their own URLs
            self.parse_history(
                bill,
                history_url(chamber, session, special, bill_type, bill_num))
            self.parse_votes(
                bill,
                vote_url(chamber, session, special, bill_type, bill_num))

            self.save_bill(bill)
Example #4
0
    def parse_senate_billpage(self, bill_url, year):
        """Scrape one senate bill page and save the Bill.

        Collects id/title/description/LR number, the primary sponsor,
        the optional cosponsor page, actions, and text versions.
        """
        with self.urlopen(bill_url) as bill_page:
            bill_page = BeautifulSoup(bill_page)
            # get all the info needed to record the bill
            bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
            bill_title = bill_page.find(id="lblBillTitle").font.string
            bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
            bill_lr = bill_page.find(id="lblLRNum").font.string

            bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                        bill_lr=bill_lr, official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor.
            # BUG FIX: tag.href looks up a child *element* called 'href'
            # (always None); the attribute must be read by subscripting.
            sponsor_tag = bill_page.find(id="hlSponsor")
            bill_sponsor = sponsor_tag.i.font.contents[0]
            bill_sponsor_link = sponsor_tag['href']
            bill.add_sponsor('primary', bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist.
            # BUG FIX: `'href' in tag` tests the tag's *children*, not its
            # attributes; use .get() to test for the attribute instead.
            cosponsor_tag = bill_page.find(id="hlCoSponsors")
            if cosponsor_tag and cosponsor_tag.get('href'):
                self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

            # get the actions
            action_url = bill_page.find(id="hlAllActions")['href']
            self.parse_senate_actions(bill, action_url)

            # full bill text is stored on a separate page
            versions_url = bill_page.find(id="hlFullBillText")
            if versions_url:
                self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
Example #5
0
    def scrape_bill(self, bill_url, chamber, session):
        """Scrape one bill page; return False if the bill doesn't exist,
        True after it has been saved."""
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            # first word of the <h2> is the bill id
            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            # summary reads "Subject: Title (...)" — split it apart
            match = re.match(r"^([^:]+): ([^(]+)", summary)
            if not match:
                raise ScrapeError("Bad title")
            subjects = [match.group(1).strip()]
            title = match.group(2).strip()

            bill = Bill(session, chamber, bill_id, title,
                        subjects=subjects)
            bill.add_source(bill_url)

            # history and authors always have dedicated pages
            history_url = page.xpath(
                "//a[text() = 'History']")[0].attrib['href']
            self.scrape_history(bill, history_url)

            authors_url = page.xpath(
                "//a[text() = 'Authors']")[0].attrib['href']
            self.scrape_authors(bill, authors_url)

            # versions: prefer the all-versions page, fall back to the
            # current text, and tolerate bills that have no text at all
            all_versions = page.xpath("//a[text() = 'Text - All Versions']")
            if all_versions:
                self.scrape_versions(bill, all_versions[0].attrib['href'])
            else:
                current = page.xpath("//a[text() = 'Text - Current']")
                if current:
                    bill.add_version("%s Current" % bill_id,
                                     current[0].attrib['href'])
                # else: some bills don't have any versions :(

            # some bills don't have any votes
            votes = page.xpath("//a[text() = 'Votes']")
            if votes:
                self.scrape_votes(bill, votes[0].attrib['href'])

            self.save_bill(bill)

            return True
Example #6
0
 def parse_special_session_bill_status_page(self, bill_id, status_page, bill_table, session, chamber, sources):
     """Build a Bill from a special-session status page and return it."""
     # NOTE(review): a leading '//' is document-absolute in lxml, so this
     # searches the whole document even though it is called on bill_table —
     # confirm that is intended.
     title = bill_table.xpath('//tr[3]/td[2]')[0].text_content()
     bill = Bill(session, chamber, bill_id, title)
     for src in sources:
         bill.add_source(src)
     # sponsors and actions come from shared table-extraction helpers
     self.add_sponsors(bill, self.get_sponsor_table(status_page))
     self.add_actions(bill, self.get_action_table(status_page))
     return bill
Example #7
0
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill: name is everything after the first '-' in the header
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # rows without a date are headers/separators
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions.  BUG FIX: this loop previously read from 'a', the
            # stale variable left over from the sponsors loop, instead of
            # the version link it iterates over.
            for version_link in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(version_link.text_content(),
                                 urlparse.urljoin(url, version_link.get('href')))

            self.save_bill(bill)
Example #8
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # BUG FIX: the context result was bound to 'lxml' (shadowing the
        # library) while the body used an undefined 'page', so this method
        # always raised NameError.  Bind it to 'page'.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill: name is everything after the first '-' in the link text
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the parallel /fulltext/ path
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: each row holds a senate date, the action text and a
            # house date; a row with no date on either side is a header.
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #9
0
 def scrape(self, chamber, session):
     """Scrape the bill listing for a session and process every row."""
     year = year_from_session(session)
     url = bills_url(year)
     with self.urlopen(url) as bills_page_html:
         bills_page = lxml.html.fromstring(bills_page_html)
         table_rows = bills_page.cssselect('tr')
         # Eliminate empty rows (every other row is blank)
         table_rows = table_rows[0:len(table_rows):2]
         for row in table_rows:
             row_elements = row.cssselect('td')

             bill_document = row_elements[0]
             bill_document.make_links_absolute(base_url())

             element, attribute, link, pos = bill_document.iterlinks().next()
             # BUG FIX: rstrip('.pdf') strips any trailing run of the
             # characters '.', 'p', 'd', 'f' — mangling ids that end in
             # one of them.  Remove the '.pdf' suffix explicitly.
             bill_id = element.text_content()
             if bill_id.endswith('.pdf'):
                 bill_id = bill_id[:-len('.pdf')]
             bill_document_link = link

             # column 1 runs the title and sponsor list together; pull
             # them apart on the case change between the two
             title_and_sponsors = row_elements[1]
             title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]', title_and_sponsors.text_content())
             sponsors_match = re.search('[a-z]([A-Z]+.+)', title_and_sponsors.text_content())
             title = title_match.group(1)
             sponsors = sponsors_match.group(1)
             separated_sponsors = sponsors.split('--')

             bill = Bill(session, chamber, bill_id, title)
             bill.add_version('current', bill_document_link)

             if separated_sponsors[1] == '(NONE)':
                 bill.add_sponsor('primary', separated_sponsors[0])
             else:
                 bill.add_sponsor('cosponsor', separated_sponsors[0])
                 bill.add_sponsor('cosponsor', separated_sponsors[1])

             versions_page_element = row_elements[2]
             versions_page_element.make_links_absolute(base_url())
             element, attribute, link, pos = versions_page_element.iterlinks().next()

             bill.add_source(link)

             self.scrape_versions(link, bill)

             # actions and votes are served inside framesets; rebuild the
             # real target URL from the '?Open&target=' query parameter
             actions_page_element = row_elements[3]
             element, attribute, link, pos = actions_page_element.iterlinks().next()
             frame_link = base_url() + link.split('?Open&target=')[1]

             self.scrape_actions(frame_link, bill)

             votes_page_element = row_elements[7]
             element, attribute, link, pos = votes_page_element.iterlinks().next()
             frame_link = base_url() + link.split('?Open&target=')[1]
             self.scrape_votes(frame_link, chamber, bill)
             # NOTE(review): nothing here calls self.save_bill(bill) —
             # confirm the bill is persisted elsewhere (e.g. in
             # scrape_votes), otherwise rows are scraped and dropped.
Example #10
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill: header is "<id> - <rest>"; keep the full text as name
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions (skip the header row)
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                # rows without a date are separators/headers
                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions.  BUG FIX: this loop previously referenced 'a'
            # (the stale sponsors-loop variable) instead of the version
            # link it iterates over.
            for version_link in page.cssselect('#versions a'):
                bill.add_version(version_link.text_content(),
                                 urlparse.urljoin(url, version_link.get('href')))

            self.save_bill(bill)
Example #11
0
    def scrape(self, chamber, session):
        """Scrape all bills for one chamber of a session."""
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {'upper': 'Senate',
                         'lower': 'House+of+Representatives'}[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        self.refresh_session()

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # bills are all their own table with cellspacing=4 (skip first)
            bill_tables = doc.xpath('//table[@cellspacing="4"]')
            for bt in bill_tables[1:]:

                # each table has 3 rows: detail row, description, blank
                details, desc, _ = bt.xpath('tr')

                # first <tr> has img, button, sponsor, topic, current house
                #   current status, committee, committee2, last action
                _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

                # pull bill_id out of script tag (gross)
                bill_id = bill_id_re.search(button.text_content()).group()
                oid = btn_re.search(button.text_content()).groups()[0]

                sponsor = sponsor.text_content()
                topic = topic.text_content()
                com1 = com1.text_content()
                com2 = com2.text_content()
                desc = desc.text_content()

                # create bill
                bill = Bill(session, chamber, bill_id, desc.strip(),
                            topic=topic)
                # BUG FIX: arguments were swapped — every other call site
                # passes the sponsorship type first, then the name.
                bill.add_sponsor('primary', sponsor)

                self.get_sponsors(bill, oid)
                self.get_actions(bill, oid)

                # craft bill URL
                # NOTE(review): the session fragment is hard-coded to
                # '2010rs', so text links are wrong for other sessions —
                # confirm and derive from `session` if needed.
                session_fragment = '2010rs'
                type_fragment = 'bills'
                bill_id_fragment = bill_id.lower()
                bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                    session_fragment, type_fragment, bill_id_fragment)
                bill.add_version('bill text', bill_text_url)

                self.save_bill(bill)
Example #12
0
    def scrape(self, chamber, year):
        """Scrape all bills for a chamber/year from the CLICS listing."""
        # Data prior to 1997 is contained in pdfs
        if year < "1997":
            raise NoDataForYear(year)

        bills_url = "http://www.leg.state.co.us/CLICS/CLICS" + year + "A/csl.nsf/%28bf-1%29?OpenView&Count=2000"
        with self.lxml_context(bills_url) as bills_page:
            table_rows = bills_page.cssselect("tr")
            # Eliminate empty rows (every other row is blank)
            table_rows = table_rows[0 : len(table_rows) : 2]
            for row in table_rows:
                # (removed leftover `print "row"` debug statement)
                row_elements = row.cssselect("td")

                bill_document = row_elements[0]
                bill_document.make_links_absolute("http://www.leg.state.co.us")

                element, attribute, link, pos = bill_document.iterlinks().next()
                # BUG FIX: rstrip(".pdf") strips any trailing run of the
                # characters '.', 'p', 'd', 'f' rather than the suffix;
                # remove ".pdf" explicitly instead.
                bill_id = element.text_content()
                if bill_id.endswith(".pdf"):
                    bill_id = bill_id[:-len(".pdf")]
                bill_document_link = link

                # column 1 runs the title and sponsor list together
                title_and_sponsors = row_elements[1]
                title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]", title_and_sponsors.text_content())
                sponsors_match = re.search("[a-z]([A-Z]+.+)", title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split("--")

                bill = Bill(year, chamber, bill_id, title)
                bill.add_version("current", bill_document_link)

                if separated_sponsors[1] == "(NONE)":
                    bill.add_sponsor("primary", separated_sponsors[0])

                else:
                    bill.add_sponsor("cosponsor", separated_sponsors[0])
                    bill.add_sponsor("cosponsor", separated_sponsors[1])

                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute("http://www.leg.state.co.us")
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                # actions and votes are served inside framesets; rebuild
                # the real target URL from the "?Open&target=" parameter
                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                self.scrape_actions(frame_link, bill)

                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]

                # BUG FIX: votes were scraped from the raw frameset link;
                # use the extracted frame target, as the actions do.
                self.scrape_votes(frame_link, chamber, bill)
Example #13
0
    def parse_standard_bill_status_page(self, bill_id, status_page, session, chamber, sources):
        """Build a Bill from a standard status page and return it."""
        # The title cell moves depending on how badly the page's nested
        # <html> markup is mangled; fall back through the known layouts.
        try:
            title = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")[0].text_content()
        except IndexError:
            inner_html_count = len(status_page.xpath("/html/html"))
            if inner_html_count == 2:
                title = status_page.xpath('/html/html[2]/tr[1]/td[2]')[0].text_content()
            else:
                title = status_page.xpath('/html/html[3]/tr[1]/td[2]')[0].text_content()

        bill = Bill(session, chamber, bill_id, title)
        for src in sources:
            bill.add_source(src)

        # sponsors and actions come from shared table-extraction helpers
        self.add_sponsors(bill, self.get_sponsor_table(status_page))
        self.add_actions(bill, self.get_action_table(status_page))

        return bill
Example #14
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        # normalize the chamber name to the 'lower'/'upper' convention
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        # First pass: fetch the detail page and create the Bill.
        # NOTE: bill_soup is reused below, after this with-block closes,
        # for sponsors and actions.
        with self.urlopen(bill_detail_url) as bill_html:
            bill_soup = BeautifulSoup(bill_html)

            bill_id = self.extract_bill_id(bill_soup)
            bill_title =  self.extract_bill_title(bill_soup)
            bill = Bill(session, chamber, bill_id, bill_title)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.

        with self.urlopen(version_list_url) as version_html:
            version_soup = BeautifulSoup(version_html)

            # MN bills can have multiple versions.  Get them all, and loop over
            # the results, adding each one.
            self.debug("Extracting bill versions from: " + version_list_url)
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(VERSION_URL_BASE, version['url'])
                bill.add_version(version_name, version_url)

            # grab primary and cosponsors
            # MN uses "Primary Author" to name a bill's primary sponsor.
            # Everyone else listed will be added as a 'cosponsor'.
            # (these come from bill_soup, the detail page fetched above)
            sponsors = self.extract_bill_sponsors(bill_soup)
            primary_sponsor = sponsors[0]
            cosponsors = sponsors[1:]
            bill.add_sponsor('primary', primary_sponsor)
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(bill_soup, chamber)
            for action in bill_actions:
                action_chamber = action['action_chamber']
                action_date = action['action_date']
                action_text = action['action_text']
                bill.add_action(action_chamber, action_text, action_date)

        self.save_bill(bill)
Example #15
0
 def scrape_session_2009(self, chamber, session):
     """Scrape every bill of the 2009 session for one chamber."""
     # renamed from 'type' to avoid shadowing the builtin
     url, bill_type = bills_url(chamber)

     with self.urlopen(url) as page_html:
         page = lxml.html.fromstring(page_html)
         for element, attribute, link, pos in page.iterlinks():
             # only follow links that point at individual bill pages
             if re.search("billtype=" + bill_type + "&billnumber=[0-9]+", link) is not None:
                 bill_page_url = bill_url(link)
                 with self.urlopen(bill_page_url) as bill_page_str:
                     bill_page = lxml.html.fromstring(bill_page_str)
                     splitted_link = link.split("=")
                     bill_number = splitted_link[-1]
                     bill_id = bill_page.cssselect('a[class="headerlink"]')[0].text_content()
                     bill_title = bill_page.cssselect('td[style="color:Black"]')[0].text_content()
                     bill = Bill(session, chamber, bill_id, bill_title)
                     bill.add_source(bill_page_url)

                     actions_table = bill_page.cssselect('table[rules="all"]')[0]
                     action_elements = actions_table.cssselect('tr')
                     # first element is not an action element
                     action_elements.pop(0)

                     for ae in action_elements:
                         action_element_parts = ae.cssselect('td')

                         action_date = action_element_parts[0]
                         # BUG FIX: the house/action cells were kept as
                         # lxml elements, so the 'H'/'S' comparisons below
                         # were always False and add_action received an
                         # element instead of text.  Extract the text once.
                         actor_house = action_element_parts[1].text_content()
                         action_text = action_element_parts[2].text_content()

                         # look for acting comittees
                         match = re.search("(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})", action_text)

                         if match is not None:
                             actor = match.group(0)
                         elif actor_house == 'H':
                             actor = "lower"
                         elif actor_house == 'S':
                             actor = "upper"
                         else:
                             actor = chamber

                         action_date = dt.datetime.strptime(action_date.text_content(), '%m/%d/%Y')

                         if re.search("The votes were as follows", action_text) is not None:
                             self.scrape_votes(action_text, bill_page_url, actor_house, action_date, bill)

                         bill.add_action(actor, action_text, action_date)

                     with self.urlopen(versions_page_url(bill_type, bill_number)) as versions_page_html:
                         versions_page = lxml.html.fromstring(versions_page_html)
                         versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                         for ve in versions_elements:
                             element_text = ve.text_content()
                             # NOTE(review): rstrip("_.HTM") strips a char
                             # *set*, not a suffix — names ending in H/T/M
                             # would be over-trimmed; confirm intent.
                             version_name = element_text.rstrip("_.HTM")
                             bill.add_version(version_name, bill_version_url(element_text))
Example #16
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page: title, primary sponsor, status
        page, and HTML versions."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # BUG FIX: after lxml parsing, '&nbsp;' is already decoded to
            # u'\xa0', so replacing the literal entity string did nothing;
            # replace the non-breaking-space character itself.
            header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
            title, primary_sponsor = header.split(' -- ')

            if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
                bill_type = ['bill']
            elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
                bill_type = ['concurrent resolution']
            elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
                bill_type = ['joint resolution']
            else:
                # BUG FIX: an unrecognized prefix previously left
                # bill_type unbound, crashing with NameError below.
                bill_type = ['bill']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            # bill status/actions live on a separate 'billsta' page
            status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_link.attrib['href'])

            # each HTML version link is labelled by the preceding text
            for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

                name = link.getprevious().tail.strip()
                bill.add_version(name, link.attrib['href'])

            self.save_bill(bill)
Example #17
0
    def scrape_bill(self, session, chamber, bill_id, url):
        """Scrape one bill page and save it; bail out if no title found."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # the title is the text trailing the ninth <br> on the page
            title = page.xpath("//br")[8].tail
            if not title:
                return
            title = title.strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            # add whichever of the known version documents are linked
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                matches = page.xpath("//a[contains(., '%s')]" % version_type)
                if matches:
                    bill.add_version(version_type, matches[0].attrib['href'])

            # fiscal notes: the statement number precedes the '(' in the
            # link text
            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Example #18
0
    def scrape(self, chamber, session):
        """Walk the paginated 'lst+ALL' listing, scraping every bill for
        the requested chamber."""
        # internal id for the session, store on self so all methods have access
        self.site_id = self.metadata['session_details'][session]['site_id']

        self.build_subject_map()

        # used for skipping bills from opposite chamber
        start_letter = 'H' if chamber == 'lower' else 'S'

        url = 'http://leg6.state.va.us/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

        while url:
            with self.urlopen(url) as html:
                doc = lxml.html.fromstring(html)

                url = None  # stop paging unless a 'More...' link appears

                for item in doc.xpath('//ul[@class="linkSect"]/li'):
                    link = item.getchildren()[0]
                    bill_id = link.text_content()

                    # the 'More...' entry points at the next listing page
                    if bill_id == 'More...':
                        url = BASE_URL + link.get('href')
                        continue

                    # skip bills from the other chamber
                    if not bill_id.startswith(start_letter):
                        continue

                    # the second character of the id encodes the type
                    desc = item.xpath('text()')[0].strip()
                    bill_type = {'B': 'bill',
                                 'J': 'joint resolution',
                                 'R': 'resolution'}[bill_id[1]]
                    bill = Bill(session, chamber, bill_id, desc,
                                type=bill_type)

                    bill_url = BASE_URL + link.get('href')
                    self.fetch_sponsors(bill)
                    self.scrape_bill_details(bill_url, bill)
                    bill['subjects'] = self.subject_map[bill_id]
                    bill.add_source(bill_url)
                    self.save_bill(bill)
Example #19
0
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill: the name is everything after the first '-' in the header
            name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the parallel /fulltext/ path
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships: names sit between the 'Sponsor and CoSponsors'
            # header row and the 'Links / Committees / Status' row
            rows = page.cssselect('center table tr')
            for row in rows:
                if row.text_content().strip() == 'Sponsor and CoSponsors':
                    continue
                if row.text_content().strip() == 'Links / Committees / Status':
                    break
                for a in row.cssselect('a'):
                    bill.add_sponsor('', a.text_content().strip())

            # Actions
            # The actions are in a pre table that looks like:
            """    SENATE                         HOUSE
                   -------------------------------------
                 1/13/95   Read 1st time          2/6/95
                 1/31/95   Favorably Reported
                 2/1/95    Read 2nd Time          2/7/95
                 2/3/95    Read 3rd Time
                 2/3/95    Passed/Adopted                   """

            # the layout is fixed-width: columns 0-21 hold the senate
            # date, 23-45 the action text, 46+ the house date; skip the
            # two header lines and any line with no date on either side
            actions = page.cssselect('pre')[0].text_content().split('\n')
            actions = actions[2:]
            for action in actions:
                senate_date = action[:22].strip()
                action_text = action[23:46].strip()
                house_date = action[46:].strip()

                if '/' not in senate_date and '/' not in house_date:
                    continue

                if senate_date:
                    bill.add_action('upper', action_text, senate_date)

                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #20
0
    def scrape_bill(self, bill_url, chamber, session):
        """Scrape one bill page; return False if the bill does not exist,
        True once the bill has been saved."""
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            doc = lxml.html.fromstring(text)
            doc.make_links_absolute(bill_url)

            # The bill id is the first token of the page's <h2> heading.
            bill_id = doc.xpath("string(//h2)").split()[0]

            # The title is the element whose text starts with 'Summary: ',
            # with that prefix stripped off.
            summary = doc.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            bill = Bill(session, chamber, bill_id, summary)

            # History and author listings live on separately linked pages.
            history_href = doc.xpath("//a[text() = 'History']")[0].attrib['href']
            self.scrape_history(bill, history_href)

            authors_href = doc.xpath("//a[text() = 'Authors']")[0].attrib['href']
            self.scrape_authors(bill, authors_href)

            # Versions: prefer the all-versions listing; fall back to the
            # single current-version link; some bills have no text at all.
            try:
                all_versions = doc.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                self.scrape_versions(bill, all_versions.attrib['href'])
            except IndexError:
                try:
                    current = doc.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    bill.add_version("%s Current" % bill_id,
                                     current.attrib['href'])
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            self.save_bill(bill)

            return True
Example #21
0
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        """Build and return a Bill from its status page (not saved here)."""
        status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))

        # Some status pages use a malformed layout (see 2007 HB 2), so each
        # field has a fallback xpath tried when the usual one matches nothing.
        def first_text(primary_xpath, fallback_xpath):
            try:
                return status_page.xpath(primary_xpath)[0].text_content()
            except IndexError:
                return status_page.xpath(fallback_xpath)[0].text_content()

        bill_id = first_text("/div/form[1]/table[2]/tr[2]/td[2]",
                             "/html/html[2]/tr[1]/td[2]")
        title = first_text("/div/form[1]/table[2]/tr[3]/td[2]",
                           "/html/html[3]/tr[1]/td[2]")

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_url)

        self.add_sponsors(bill, status_page)
        self.add_actions(bill, status_page)

        return bill
Example #22
0
    def scrape(self, chamber, session):
        """Scrape all bills for one chamber of a Utah session.

        Walks the session's bill-list index page, follows each per-range
        list page, and scrapes every bill page linked from those lists.
        """
        self.validate_session(session)

        # Bill prefix differs by chamber.
        if chamber == "lower":
            bill_abbr = "HB"
        else:
            bill_abbr = "SB"

        bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
            session.replace(' ', ''))
        self.log("Getting bill list for %s, %s" % (session, chamber))

        try:
            base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed to Exception.
            # this session doesn't exist for this year
            return

        # Raw strings so \d is a regex digit class, not a string escape
        # (non-raw '\d' is a deprecated escape in Python 3).
        bill_list_link_re = re.compile(r'.*%s\d+ht.htm$' % bill_abbr)

        for link in base_bill_list.findAll('a', href=bill_list_link_re):
            bill_list = self.soup_parser(self.urlopen(link['href']))
            bill_link_re = re.compile(r'.*billhtm/%s.*.htm' % bill_abbr)

            for bill_link in bill_list.findAll('a', href=bill_link_re):
                bill_id = bill_link.find(text=True).strip()

                bill_info_url = bill_link['href']
                bill_info = self.soup_parser(self.urlopen(bill_info_url))

                # Title and primary sponsor share one " -- "-separated line.
                bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                    '&nbsp;', ' ').strip().split(' -- ')

                bill = Bill(session, chamber, bill_id, bill_title)
                bill.add_source(bill_info_url)
                bill.add_sponsor('primary', primary_sponsor)

                status_re = re.compile(r'.*billsta/%s.*.htm' %
                                       bill_abbr.lower())
                status_link = bill_info.find('a', href=status_re)

                if status_link:
                    self.parse_status(bill, status_link['href'])

                text_find = bill_info.find(
                    text="Bill Text (If you are having trouble viewing")

                if text_find:
                    # Skip the first link ([1:]): it belongs to the
                    # trouble-viewing notice, not a bill version.
                    text_link_re = re.compile(r'.*\.htm')
                    for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                        version_name = text_link.previous.strip()
                        bill.add_version(version_name, text_link['href'])

                self.save_bill(bill)
Example #23
0
    def scrape(self, chamber, session):
        """Scrape all bills for one chamber of a Michigan session.

        Iterates bill numbers sequentially (1+ for SB, 4001+ for HB) until
        scrape_bill stops returning a page.
        """
        self.validate_session(session)

        # Chamber determines both the bill prefix and the starting number.
        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'
        while True:
            bill_page = self.scrape_bill(session, abbr, bill_no)
            bill_page = BeautifulSoup(bill_page)
            # if we can't find a page, we must be done. This is a healthy thing.
            # NOTE(review): this check runs AFTER BeautifulSoup() wrapped the
            # value, so bill_page is a soup object here and `== None` looks
            # like it can never be true -- confirm what scrape_bill returns
            # for a missing page and whether this termination path ever fires.
            if bill_page == None: return
            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n','').replace('\r','')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            #sponsors
            # The first <a> in the sponsor list is the primary sponsor;
            # all subsequent ones are co-sponsors.
            first = 0
            for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
                the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
                first = 1

            #versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r: the_bill.add_version(*r)

            #documents
            # House and Senate analysis tables are optional; probe the raw
            # markup before indexing so findAll(...)[0] can't raise.
            if 'frg_billstatus_HlaTable' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)
            if 'frg_billstatus_SfaSection' in str(bill_page):
                for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r: the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
        pass
Example #24
0
    def scrape_bill(self, chamber, session, bill_type, number):
        """Scrape one Maryland bill page and save the resulting Bill.

        bill_type is the bill's letter prefix (containing 'B' for bills or
        'J' for joint resolutions); number is the numeric part of the id.
        """
        # 4-character sessions are regular sessions; the URL wants an
        # 'rs' suffix for those.
        if len(session) == 4:
            session_url = session + 'rs'
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # find <a name="Title">, get parent dt, get parent dl, then dd n dl
            title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

            synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

            if 'B' in bill_type:
                _type = ['bill']
            elif 'J' in bill_type:
                _type = ['joint resolution']
            else:
                # Previously fell through with _type unbound and raised
                # UnboundLocalError below; fail with an explicit message.
                raise ValueError('unexpected bill type: %r' % bill_type)

            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                        type=_type, synopsis=synopsis)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)     # sponsors
            self.parse_bill_actions(doc, bill)      # actions
            self.parse_bill_documents(doc, bill)    # documents and versions
            self.parse_bill_votes(doc, bill)        # votes

            # subjects: text before any '-see also-' cross-reference
            subjects = []
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
                subjects.append(subj.text.split('-see also-')[0])
            bill['subjects'] = subjects

            # add bill to collection
            self.save_bill(bill)
Example #25
0
    def scrape_info(self, session, bill_number):
        """Scrape sponsor info for one New Jersey bill and save it.

        bill_number is a (bill_id, title) pair.
        """
        bill_view_url = 'http://www.njleg.state.nj.us/bills/BillView.asp'
        bill_id = bill_number[0]
        bill_view_body = 'BillNumber=%s++++&LastSession=' % bill_number[0]
        with self.urlopen(bill_view_url, 'POST', bill_view_body) as bill_view_page:
            root = lxml.etree.fromstring(bill_view_page, lxml.etree.HTMLParser())
            title = bill_number[1]
            # 'A...' ids are Assembly bills, 'S...' ids are Senate bills.
            # (Was inconsistently testing bill_id[0] and bill_number[0][0],
            # which are the same value; and left `chamber` unbound for any
            # other prefix -- now an explicit error.)
            if bill_id[0] == 'A':
                chamber = 'General Assembly'
            elif bill_id[0] == 'S':
                chamber = 'Senate'
            else:
                raise ValueError('unexpected bill id: %r' % bill_id)
            bill = Bill(session, chamber, bill_id, title)

            # Grabbing sponsors: the sponsor block lists all 'Primary'
            # entries first, then co-sponsors.
            sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[3])').split()
            primary_count = sponsorship.count('Primary')
            sponsor_count = 1
            # Special case: this one bill's page has an extra leading block,
            # shifting the sponsor list to the fifth <font>.
            if session == 214 and bill_id == 'A101':
                sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[5])').split()
                primary_count = sponsorship.count('Primary')
            for sp in root.xpath('//tr[1]/td[1]/div/font/a/font'):
                sponsor = sp.xpath('string()').split()
                # Names appear as "Last, First [Middle]"; rebuild them as
                # "First [Middle] Last" and drop the trailing comma.
                if len(sponsor) == 3:
                    leg = sponsor[1] + " " + sponsor[2] + " " + sponsor[0]
                    leg = leg[0: len(leg) - 1]
                elif len(sponsor) == 2:
                    leg = sponsor[1] + " " + sponsor[0]
                    leg = leg[0: len(leg) - 1]
                else:
                    # Previously reused the previous iteration's name (or
                    # raised UnboundLocalError on the first); skip instead.
                    continue

                if sponsor_count <= primary_count:
                    sponsor_type = 'Primary'
                else:
                    sponsor_type = 'Co-sponsor'

                bill.add_sponsor(sponsor_type, leg)
                sponsor_count = sponsor_count + 1

            self.save_bill(bill)
Example #26
0
    def scrape_bill(self, chamber, session, billid, histurl, year):
        """Scrape one bill's history page and save the resulting Bill.

        year is either a plain session year or an 'R...'-prefixed token
        that selects a sub-session from the scraper metadata.
        """
        # NOTE(review): the else branch runs exactly when year[0] == 'R',
        # so int(year[0]) is int('R') and raises ValueError. This probably
        # meant to index by a digit elsewhere in `year` -- confirm against
        # the metadata layout before relying on this path.
        if year[0] != 'R':
            session = year
        else:
            session = self.metadata['session_details'][year][
                'sub_sessions'][int(year[0]) - 1]

        with self.urlopen(histurl) as data:
            soup = BeautifulSoup(cleansource(data))
            # Left-hand basic-info pane: holds SUMMARY/SPONSOR labels and
            # a table of version links.
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            # Title follows the SUMMARY label; sponsor links follow the
            # SPONSOR label until the first non-sponsor link.
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            # Each row of the basic-info table links one bill version.
            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            # History table: skip the header row; each action row holds a
            # date cell and an action-text cell.
            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                # Attribute the action to a chamber by keyword, falling
                # back to the bill's own chamber.
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Example #27
0
    def scrape_bill(self, chamber, session, bill_type, number):
        """ Creates a bill object

        bill_type is the bill's letter prefix (containing 'B' for bills or
        'J' for joint resolutions); number is the numeric part of the id.
        """
        # 4-character sessions are regular sessions; the URL wants an
        # 'rs' suffix for those.
        if len(session) == 4:
            session_url = session + "rs"
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # find <a name="Title">, get parent dt, get parent dl, then dd n dl
            title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

            # create the bill object now that we have the title
            print "%s %d %s" % (bill_type, number, title)

            # NOTE(review): if bill_type contains neither 'B' nor 'J',
            # _type stays unbound and the Bill() call below raises
            # UnboundLocalError -- confirm callers only pass B/J types.
            if "B" in bill_type:
                _type = ["bill"]
            elif "J" in bill_type:
                _type = ["joint resolution"]

            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title, type=_type)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)  # sponsors
            self.parse_bill_actions(doc, bill)  # actions
            self.parse_bill_documents(doc, bill)  # documents and versions
            self.parse_bill_votes(doc, bill)  # votes

            # subjects: text before any '-see also-' cross-reference
            subjects = []
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
                subjects.append(subj.text.split("-see also-")[0])
            bill["subjects"] = subjects

            # add bill to collection
            self.save_bill(bill)
Example #28
0
    def parse_bill(self, chamber, session, special, link):
        """Parse one Pennsylvania bill from its index link and save it."""
        bill_number = link.contents[0]
        # The href carries the bill's type code: B, R, or empty.
        # (Renamed from `type`, which shadowed the builtin.)
        bill_type = re.search('type=(B|R|)', link['href']).group(1)
        bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_number)

        bill_info_url = info_url(chamber, session, special, bill_type,
                                 bill_number)

        with self.urlopen(bill_info_url) as info_page:
            info_page = BeautifulSoup(info_page)

            # The short title is the element right after its label.
            title = info_page.find(text='Short Title:').findNext().contents[0]

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(bill_info_url)

            self.parse_bill_versions(bill, info_page)

            self.parse_history(bill, history_url(chamber, session, special,
                                                 bill_type, bill_number))

            self.parse_votes(bill, vote_url(chamber, session, special,
                                            bill_type, bill_number))

            self.save_bill(bill)
Example #29
0
    def scrape_assem_bills(self, chamber, insert, session):
        """Scrape all Assembly bills for one Nevada session.

        insert is the session path component used in legislature URLs.
        """
        # DoctypeIDs 1, 3, 5, 6 are the Assembly document types.
        # (Removed an unused `count` loop counter.)
        doc_type = [1, 3, 5, 6]
        for doc in doc_type:
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, doc)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
                    bill = Bill(session, chamber, bill_id, title)

                    primary, secondary = self.scrape_sponsors(page_path)

                    if primary[0] == 'By:':
                        # Committee sponsor: the name arrives split into
                        # words after the 'By:' marker; rejoin them.
                        primary.pop(0)

                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        # join replaces a quadratic concat loop; keep the
                        # trailing space the original loop produced.
                        bill.add_sponsor('primary', ' '.join(primary) + ' ')
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    self.scrape_actions(page_path, bill, "Assembly")
                    self.scrape_votes(page_path, bill, "Assembly", insert, title)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Example #30
0
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        """Scrape one Tennessee bill page and save the resulting Bill."""
        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)

            title = page.xpath("//span[@id='lblAbstract']")[0].text
            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)

            # Primary sponsor: text after "by" in the sponsor label, with
            # any asterisk marker stripped.
            sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc is linked from the sponsor label.
            summary_link = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions table: skip the header row; each row holds an action
            # text cell and a date cell.
            actions_table = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
            for row in actions_table.xpath("tr[position()>1]"):
                cells = row.xpath("td")
                action_date = datetime.datetime.strptime(
                    cells[1].text.strip(), '%m/%d/%Y')
                bill.add_action(chamber, cells[0].text, action_date)

            # Votes, when linked, are scraped from a separate page.
            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if votes_link:
                href = votes_link[0].get('href')
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (href,))

            self.save_bill(bill)