Exemple #1
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape every measure on the leg.state.nv.us history lists.

        One history-list page is fetched per document type; each bill page
        linked from it is parsed into a ``Bill`` carrying its text version,
        sponsors, minutes documents, actions and votes, and is then saved.

        :param chamber: chamber recorded on each ``Bill``
        :param insert: session path segment used to build the report URLs
        :param session: session recorded on each ``Bill``; ``insert`` is used
            instead when it contains ``'Special'``
        :param year: forwarded to ``scrape_votes``
        """
        site = 'http://www.leg.state.nv.us'
        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution'}

        # Loop-invariant: when the URL segment names a special session it
        # replaces the session recorded on every bill.
        if insert.find('Special') != -1:
            session = insert

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = ('%s/Session/%s/Reports/HistListBills.cfm'
                              '?DoctypeID=%s' % (site, insert, docnum))
            for link in self.scrape_links(parentpage_url):
                page_path = '%s/Session/%s/Reports/%s' % (site, insert, link)
                with self.urlopen(page_path) as page:
                    # Non-breaking spaces would otherwise leak into the
                    # extracted strings.
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    bill.add_version("Bill Text", site + bill_text)

                    primary, secondary = self.scrape_sponsors(page)

                    # Guard against an empty list: indexing it used to
                    # raise IndexError and abort the whole scrape.
                    if primary and primary[0] == 'By:':
                        # The tokens after 'By:' form a single sponsor name.
                        name_parts = primary[1:]
                        if name_parts:
                            if name_parts[0] == 'ElectionsProceduresEthicsand':
                                name_parts[0] = 'Elections Procedures Ethics and'
                            # join() avoids the trailing space the manual
                            # concatenation left on the name.
                            bill.add_sponsor('primary', ' '.join(name_parts))
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    # The date for the n-th minutes link is read from row
                    # n + 1 of the same table (rows are counted from 2).
                    minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                    for row_no, mr in enumerate(minutes_links, 2):
                        minutes_url = site + mr.xpath("string(@href)")
                        date_parts = mr.xpath(
                            "string(//table[4]/tr[%s]/td[2])" % row_no).split()
                        minutes_date = (date_parts[0] + date_parts[1] +
                                        date_parts[2] + " Minutes")
                        bill.add_document(minutes_date, minutes_url)

                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemple #2
0
    def scrape_bill(self, session, chamber, bill_id, url):
        """Fetch one bill page and save the bill it describes.

        The title is the text that follows the ninth ``<br>`` element; when
        that text is empty the method returns without saving anything.
        Actions, versions, fiscal impact statements and subjects are
        attached before the bill is saved.
        """
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)

            raw_title = doc.xpath("//br")[8].tail
            if not raw_title:
                return

            bill = Bill(session, chamber, bill_id, raw_title.strip())
            bill.add_source(url)

            # The actions live on a separate page linked from this one.
            actions_anchor = doc.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, actions_anchor.attrib['href'])

            version_names = ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act')
            for version_name in version_names:
                matches = doc.xpath("//a[contains(., '%s')]" % version_name)
                if matches:
                    # Only the first matching link is recorded.
                    bill.add_version(version_name, matches[0].attrib['href'])

            for anchor in doc.xpath("//a[contains(@href, 'FISCAL')]"):
                # The statement number is the link text before any "(".
                statement_no = anchor.text.strip().split("(")[0]
                bill.add_document(
                    "Fiscal Impact Statement #%s" % statement_no,
                    anchor.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Exemple #3
0
    def scrape(self, chamber, session):
        """Scrape the legisweb.state.wy.us bill cross-reference index.

        Every ``valign='middle'`` row after the first becomes a ``Bill``
        with its digest, versions and linked documents, and is saved.
        """
        abbrevs = {'upper': 'SF', 'lower': 'HB'}
        url = ("http://legisweb.state.wy.us/%s/billindex/BillCrossRef.aspx"
               "?type=%s" % (session, abbrevs[chamber]))
        index = lxml.html.fromstring(self.urlopen(url))

        # Column positions holding version links and document links.
        version_cells = ('td[6]//a', 'td[9]//a', 'td[10]//a')
        document_cells = (('Fiscal Note', 'td[7]//a'),
                          ('Summary', 'td[12]//a'))

        for row in index.xpath("//tr[@valign='middle']")[1:]:
            bill_id = row.xpath("string(td[1])").strip()
            title = row.xpath("string(td[2])").strip()

            # Ids starting with SJ/HJ are joint resolutions.
            is_joint = bill_id[0:2] in ['SJ', 'HJ']
            bill = Bill(session, chamber, bill_id, title,
                        type='joint resolution' if is_joint else 'bill')

            self.scrape_digest(bill)

            for cell in version_cells:
                for anchor in row.xpath(cell):
                    bill.add_version(anchor.text, anchor.get('href'))

            for doc_name, cell in document_cells:
                anchors = row.xpath(cell)
                if anchors:
                    # Only the first link in the cell is recorded.
                    bill.add_document(doc_name, anchors[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #4
0
    def process_bill(self, data):
        """Build a ``Bill`` from a serialized bill dict and save it.

        ``data`` is read through the keys used below: identifying fields,
        ``abstracts``, ``extras``, ``actions``, ``sources``,
        ``sponsorships``, ``versions``, ``documents``, ``other_titles`` and
        ``related_bills``.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            # A 'legislature' classification is stored as the upper chamber.
            chamber = 'upper'

        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])

        abstracts = data['abstracts']
        if abstracts:
            # Only the first abstract is kept as the summary.
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(act['organization_id'])['classification']

            # Split the related entities into committees and legislators.
            entities = act['related_entities']
            committees = [ent['name'] for ent in entities
                          if ent['entity_type'] == 'organization']
            legislators = [ent['name'] for ent in entities
                           if ent['entity_type'] == 'person']

            description = act['description']
            when = parse_date(act['date'])
            categories = _action_categories(act['classification'])
            bill.add_action(actor, description, when,
                            type=categories,
                            committees=committees,
                            legislators=legislators,
                            **act.get('extras', {}))

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # Each link of a version/document is recorded as its own entry.
        for ver in data['versions']:
            for ver_link in ver['links']:
                bill.add_version(ver['note'], ver_link['url'],
                                 mimetype=ver_link['media_type'],
                                 date=parse_date(ver['date']),
                                 **ver.get('extras', {}))

        for document in data['documents']:
            for doc_link in document['links']:
                bill.add_document(document['note'], doc_link['url'],
                                  mimetype=doc_link['media_type'],
                                  date=parse_date(document['date']),
                                  **document.get('extras', {}))

        for other in data['other_titles']:
            bill.add_title(other['title'])

        for companion in data['related_bills']:
            bill.add_companion(companion['identifier'],
                               companion['legislative_session'],
                               chamber)

        self.save_bill(bill)
Exemple #5
0
    def scrape(self, chamber, session):
        """Save one fixed sample bill with two votes and three actions.

        The session is validated first; the bill id is ``'SB 1'`` for the
        upper chamber and ``'HB 1'`` otherwise.
        """
        self.validate_session(session)

        is_upper = chamber == 'upper'
        bill_id = 'SB 1' if is_upper else 'HB 1'
        other_chamber = 'lower' if is_upper else 'upper'

        bill = Bill(session, chamber, bill_id, 'A super bill')
        bill.add_source('http://example.com/')
        bill.add_version('As Introduced', 'http://example.com/SB1.html')
        bill.add_document('Google', 'http://google.com')
        for role, name in (('primary', 'Bob Smith'),
                           ('secondary', 'Johnson, Sally')):
            bill.add_sponsor(role, name)

        date_format = '%m/%d/%Y'
        first_day = datetime.datetime.strptime('1/29/2010', date_format)
        second_day = datetime.datetime.strptime('1/30/2010', date_format)

        # A passing upper-chamber vote with two yes votes.
        passed = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
        for voter in ('Smith', 'Johnson'):
            passed.yes(voter)

        # A failing lower-chamber vote with one no and one other.
        failed = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
        failed.no('Bob Smith')
        failed.other('S. Johnson')

        for vote in (passed, failed):
            bill.add_vote(vote)

        sample_actions = (
            (chamber, 'introduced', first_day),
            (chamber, 'read first time', second_day),
            (other_chamber, 'introduced', second_day),
        )
        for actor, text, when in sample_actions:
            bill.add_action(actor, text, when)

        self.save_bill(bill)
Exemple #6
0
    def scrape(self, chamber, session):
        """Turn each row of the bill cross-reference table into a saved Bill.

        The index page for the chamber is fetched once; the first
        ``valign='middle'`` row is skipped and every later row supplies the
        bill id, title, version links and document links.
        """
        type_param = {'upper': 'SF', 'lower': 'HB'}[chamber]
        index_url = ("http://legisweb.state.wy.us/%s/billindex/"
                     "BillCrossRef.aspx?type=%s") % (session, type_param)
        tree = lxml.html.fromstring(self.urlopen(index_url))

        data_rows = tree.xpath("//tr[@valign='middle']")[1:]
        for cells in data_rows:
            bill_id = cells.xpath("string(td[1])").strip()
            title = cells.xpath("string(td[2])").strip()

            # SJ/HJ prefixes mark joint resolutions; all else is a bill.
            bill_type = 'bill'
            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            self.scrape_digest(bill)

            # Version links are spread over three columns.
            version_links = cells.xpath('td[6]//a')
            version_links = version_links + cells.xpath('td[9]//a')
            version_links = version_links + cells.xpath('td[10]//a')
            for version_link in version_links:
                bill.add_version(version_link.text, version_link.get('href'))

            # Documents: the first link of the fiscal-note and summary
            # columns, when present.
            fiscal_links = cells.xpath('td[7]//a')
            if fiscal_links:
                bill.add_document('Fiscal Note', fiscal_links[0].get('href'))
            summary_links = cells.xpath('td[12]//a')
            if summary_links:
                bill.add_document('Summary', summary_links[0].get('href'))

            bill.add_source(index_url)
            self.save_bill(bill)
Exemple #7
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape the leg.state.nv.us history lists into saved ``Bill`` objects.

        For every document type the history-list page is fetched, and each
        bill page it links to yields a ``Bill`` with subjects, a PDF text
        version, sponsors, minutes documents, actions and votes.
        """
        host = "http://www.leg.state.nv.us"
        reports_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' % insert
        doc_type = {
            1: 'bill',
            3: 'resolution',
            5: 'concurrent resolution',
            6: 'joint resolution'
        }

        for docnum, bill_type in doc_type.iteritems():
            list_url = '%sHistListBills.cfm?DoctypeID=%s' % (reports_url,
                                                             docnum)
            for link in self.scrape_links(list_url):
                page_path = '%s%s' % (reports_url, link)
                # Non-breaking spaces are flattened before parsing.
                page = self.urlopen(page_path).replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                content = '/html/body/div[@id="content"]'
                bill_id = root.xpath(
                    'string(%s/table[1]/tr[1]/td[1]/font)' % content)
                title = root.xpath('string(%s/table[1]/tr[5]/td)' % content)

                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                bill['subjects'] = self.subject_mapping[bill_id]

                text_href = root.xpath(
                    "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)"
                )
                bill.add_version("Bill Text",
                                 host + text_href,
                                 mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)
                for sponsor_type, names in (('primary', primary),
                                            ('cosponsor', secondary)):
                    for leg in names:
                        bill.add_sponsor(sponsor_type, leg)

                # The date of the n-th minutes link is read from row n + 1
                # of the same table (rows are counted from 2).
                minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_no, anchor in enumerate(minutes_links, 2):
                    href = anchor.xpath("string(@href)")
                    words = anchor.xpath(
                        "string(//table[4]/tr[%s]/td[2])" % row_no).split()
                    label = words[0] + words[1] + words[2] + " Minutes"
                    bill.add_document(label, host + href)

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemple #8
0
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        """Fetch one bill page and save the bill it describes.

        Three hard-coded bill ids and pages headed ``Bill Withdrawn`` are
        skipped. The title is the text after the ninth ``<br>`` element,
        falling back to ``short_title`` when that text is empty. Actions,
        versions, roll-call votes, fiscal impact statements and subjects are
        attached before saving.

        :raises ValueError: when the bill id prefix does not end in one of
            ``B``, ``JR``, ``CR`` or ``R`` (previously this surfaced as an
            ``UnboundLocalError`` on ``bill_type``)
        """
        # NOTE(review): the reason these ids are excluded is not visible
        # here -- confirm before changing the list.
        if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
            return

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # check for Bill Withdrawn header
            h1text = page.xpath('//h1/text()')
            if h1text and h1text[0] == 'Bill Withdrawn':
                return

            title = page.xpath("//br")[8].tail
            if not title:
                title = short_title
            title = title.strip()

            abbrev = bill_id.split()[0]
            # 'JR' and 'CR' are tested before the bare 'R' they also end in.
            suffix_types = (
                ('B', 'bill'),
                ('JR', 'joint resolution'),
                ('CR', 'concurrent resolution'),
                ('R', 'resolution'),
            )
            for suffix, type_name in suffix_types:
                if abbrev.endswith(suffix):
                    bill_type = [type_name]
                    break
            else:
                raise ValueError(
                    "unrecognized bill type prefix %r in bill id %r"
                    % (abbrev, bill_id))

            bill = Bill(session, chamber, bill_id, title,
                        type=bill_type)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                links = page.xpath(version_path % version_type)
                if links:
                    # Only the first matching link is recorded.
                    bill.add_version(version_type, links[0].attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, vote_link.attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
                self.scrape_house_vote(bill, vote_link.attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                # The statement number is the link text before any "(".
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Exemple #9
0
    def scrape_bill(self, term, bill_url):
        """Parse one bill page and save the resulting ``Bill``.

        The page shows one bill number, or two when a companion exists; the
        number containing ``*`` is treated as the primary id and loses its
        first character. The abstract, primary sponsor, summary link,
        action history and (when linked) votes are then attached.
        """
        with self.urlopen(bill_url) as html:
            doc = lxml.html.fromstring(html)

            first_label = doc.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
            companion_links = doc.xpath('//span[@id="lblCoBillSponsor"]/a[1]')

            if companion_links:
                second_label = companion_links[0].text

                # Whichever label carries the '*' is the primary bill.
                if '*' in first_label:
                    starred, plain = first_label, second_label
                else:
                    starred, plain = second_label, first_label

                bill_id = starred.replace(' ', '')[1:]
                secondary_bill_id = plain.replace(' ', '')
                primary_chamber = 'lower' if 'H' in bill_id else 'upper'
            else:
                primary_chamber = 'lower' if 'H' in first_label else 'upper'
                bill_id = first_label.replace(' ', '')[1:]
                secondary_bill_id = None

            title = doc.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(term, primary_chamber, bill_id, title,
                        secondary_bill_id=secondary_bill_id)
            bill.add_source(bill_url)

            # Primary sponsor: the text after the last "by", minus any '*'.
            sponsor_span = doc.xpath("//span[@id='lblBillSponsor']")[0]
            sponsor = sponsor_span.text_content().split("by")[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary_link = doc.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions: every history row after the first one.
            history = doc.xpath(
                "//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
            for row in history.xpath("tr[position()>1]"):
                cells = row.xpath("td")
                action_taken = cells[0].text
                action_date = datetime.datetime.strptime(
                    cells[1].text.strip(), '%m/%d/%Y')
                # NEED TO ADD SECONDARY ACTIONS
                bill.add_action(primary_chamber, action_taken, action_date)

            vote_links = doc.xpath("//span[@id='lblBillVotes']/a")
            if vote_links:
                votes_url = ('http://wapp.capitol.tn.gov/apps/Billinfo/%s'
                             % (vote_links[0].get('href'),))
                bill = self.scrape_votes(bill, sponsor, votes_url)

            self.save_bill(bill)
Exemple #10
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_type):
        """Parse one bill page, plus its linked sub-pages, and save the bill.

        Authors, digests and text versions come from pages linked by label;
        the title, bill id and action rows come from the bill page itself.
        ``bill_type`` is mapped through its suffix after the first character
        (``B`` or ``CR``).
        """
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def linked_page(label):
            # Follow the first link whose text contains ``label``; an
            # IndexError escapes when the page has no such link.
            anchors = page.xpath("//a[contains(text(), '%s')]" % (label))
            return self.scrape_bare_page(anchors[0].attrib["href"])

        authors = [entry.text for entry in linked_page("Authors")]

        # Digests and text versions are optional.
        try:
            digests = linked_page("Digests")
        except IndexError:
            digests = []

        try:
            versions = linked_page("Text")
        except IndexError:
            versions = []

        title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        action_rows = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']//table[@style='font-size:small']/tr")
        bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        type_names = {"B": "bill", "CR": "concurrent resolution"}
        bill = Bill(session, chamber, bill_id, title, type=type_names[bill_type[1:]])
        bill.add_source(bill_url)

        # The primary author is listed among the authors; every remaining
        # name is a cosponsor.
        authors.remove(author)
        bill.add_sponsor("primary", author)
        for cosponsor in authors:
            bill.add_sponsor("cosponsor", cosponsor)

        for digest in digests:
            bill.add_document(digest.text, digest.attrib["href"], mimetype="application/pdf")

        for version in versions:
            bill.add_version(version.text, version.attrib["href"], mimetype="application/pdf")

        flags = {"prefiled": ["bill:filed"], "referred to the committee": ["committee:referred"]}
        chamber_codes = {"S": "upper", "H": "lower", "J": "joint"}

        for row in action_rows:
            # Each action row must hold exactly four cells.
            date_text, chamber_code, _third_cell, text = [cell.text for cell in row.xpath(".//td")]
            # Session is April --> June. Prefiles look like they're in
            # January at earliest.
            when = dt.datetime.strptime(date_text + "/%s" % (session), "%m/%d/%Y")
            actor = chamber_codes[chamber_code]

            lowered = text.lower()
            categories = []
            for phrase in flags:
                if phrase in lowered:
                    categories += flags[phrase]
            if not categories:
                categories = ["other"]

            bill.add_action(actor, text, when, categories)

        self.save_bill(bill)
Exemple #11
0
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        """Scrape a single bill page into a ``Bill`` and save it.

        Nothing is saved for the three hard-coded bill ids below or for a
        page whose first ``<h1>`` reads ``Bill Withdrawn``. The title is the
        tail text of the ninth ``<br>``; ``short_title`` is used when that
        is empty.

        :raises ValueError: when the id prefix ends in none of ``B``, ``JR``,
            ``CR`` or ``R``; the old if/elif chain left ``bill_type``
            unbound in that case and failed with ``UnboundLocalError``
        """
        # NOTE(review): why these ids are skipped cannot be told from this
        # method -- verify before editing the list.
        if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
            return

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # check for Bill Withdrawn header
            h1text = page.xpath('//h1/text()')
            if h1text and h1text[0] == 'Bill Withdrawn':
                return

            title = page.xpath("//br")[8].tail
            if not title:
                title = short_title
            title = title.strip()

            abbrev = bill_id.split()[0]
            # Order matters: 'JR' and 'CR' also end in 'R', so the bare 'R'
            # must be tried last.
            bill_type = None
            for suffix, type_name in (('B', 'bill'),
                                      ('JR', 'joint resolution'),
                                      ('CR', 'concurrent resolution'),
                                      ('R', 'resolution')):
                if abbrev.endswith(suffix):
                    bill_type = [type_name]
                    break
            if bill_type is None:
                raise ValueError(
                    "unrecognized bill type prefix %r in bill id %r"
                    % (abbrev, bill_id))

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)

            action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, action_link.attrib['href'])

            version_path = "//a[contains(., '%s')]"
            for version_type in ('Introduced Bill', 'House Bill',
                                 'Senate Bill', 'Engrossed Bill',
                                 'Enrolled Act'):
                links = page.xpath(version_path % version_type)
                if links:
                    # Only the first matching link is recorded.
                    bill.add_version(version_type, links[0].attrib['href'])

            # Roll calls are linked per chamber.
            for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, vote_link.attrib['href'])

            for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
                self.scrape_house_vote(bill, vote_link.attrib['href'])

            for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
                # The statement number is the link text before any "(".
                num = doc_link.text.strip().split("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % num,
                                  doc_link.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Exemple #12
0
    def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
        """Download one bill page and save the ``Bill`` built from it.

        Sponsors, actions (listed in reverse page order) and the items
        yielded by ``BillDocuments`` (versions, documents, roll calls) are
        attached. ``subject``, when given, becomes the bill's only subject.
        """
        self.logger.info('GET ' + url)
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        bill = Bill(term, chamber, bill_id, title)
        bill.add_source(url)
        if subject is not None:
            bill['subjects'] = [subject]

        # Sponsors: the page's role label decides the sponsorship type.
        sponsor_map = {
            'author': 'primary',
            'co-author': 'cosponsor',
            'sponsor': 'cosponsor',
            'co-sponsor': 'cosponsor',
        }
        for block in doc.xpath('//div[contains(@class, "bill-author-info")]'):
            name = block.xpath('string(b)').strip()
            role = block.xpath('string(p)').strip().lower()
            bill.add_sponsor(sponsor_map[role], name)

        # Actions, walked from the last list item to the first.
        chamber_codes = dict(H='lower', S='upper')
        for item in reversed(doc.xpath('//div[@id="bill-actions"]//li')):
            if item.text_content() == 'None currently available.':
                continue
            action_chamber = chamber_codes[item.xpath('string(strong)').strip()]
            date_text = item.xpath('string(span[@class="document-date"])')
            action_date = datetime.datetime.strptime(date_text.strip(),
                                                     '%m/%d/%Y')
            action_text = item.xpath('string(span[2])').strip()
            if not action_text:
                # Items with no action text are skipped.
                continue
            action = dict(date=action_date,
                          actor=action_chamber,
                          action=action_text)
            action.update(**self.categorizer.categorize(action_text))
            bill.add_action(**action)

        # Documents (including votes)
        for kind, meta in BillDocuments(self, doc):
            if kind == 'rollcall':
                self.add_rollcall(chamber, bill, meta)
            elif kind == 'version':
                bill.add_version(meta.title or meta.text,
                                 url=meta.url,
                                 mimetype='application/pdf')
            elif kind == 'document':
                bill.add_document(meta.title or meta.text,
                                  url=meta.url,
                                  mimetype='application/pdf')

        self.save_bill(bill)
Exemple #13
0
    def scrape_current(self, chamber, term):
        """Scrape the current bills of one chamber from the bill_status feed.

        The JSON returned by ``ksapi.url + "bill_status/"`` lists bills of
        every chamber; only bills with at least one history entry in the
        requested chamber are turned into ``Bill`` objects and saved.

        :param chamber: ``"upper"`` selects Senate history entries, any
            other value selects House entries
        :param term: term recorded on each ``Bill``
        """
        chamber_name = "Senate" if chamber == "upper" else "House"
        # Actor used for events that name no committee or conferees. The
        # old code compared ``chamber`` with "Senate", which never matched,
        # so every such event was credited to "lower".
        chamber_actor = "upper" if chamber_name == "Senate" else "lower"

        # TODO: perhaps save this data so one request serves both chambers.
        with self.urlopen(ksapi.url + "bill_status/") as bill_request:
            bills = json.loads(bill_request)["content"]
            for bill_data in bills:
                # Filter out other chambers. This is recomputed for every
                # bill: the old flag was initialised under a different name,
                # so it was either undefined or stale from an earlier bill.
                if not any(
                    history["chamber"] == chamber_name
                    for history in bill_data["HISTORY"]
                ):
                    continue

                bill_no = bill_data["BILLNO"]
                bill = Bill(term, chamber, bill_no, bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_no.lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                # A lone sponsor is the primary; several are all cosponsors.
                sponsors = bill_data["SPONSOR_NAMES"]
                sponsor_type = "primary" if len(sponsors) == 1 else "cosponsor"
                for sponsor in sponsors:
                    bill.add_sponsor(sponsor_type, sponsor)

                for event in bill_data["HISTORY"]:
                    # The actor comes from the event itself. The old code
                    # tested a leftover loop variable and then read the
                    # names from the bill record instead of the event.
                    names = []
                    for key in ("committee_names", "conferee_names"):
                        if key in event:
                            names.extend(event[key])
                    actor = " and ".join(names) if names else chamber_actor

                    date = datetime.datetime.strptime(
                        event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        # Groups 1-3 of votes_re supply the motion, yes
                        # count and no count.
                        votes = votes_re.match(event["status"])
                        if votes:
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(
                                ksapi.ksleg + "bill_status/" + bill_no.lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Exemple #14
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one bill page and save the resulting ``Bill``.

        A page that cannot be opened is logged and skipped. The ``<h3>``
        header supplies ``title -- primary sponsor``; HTML/PDF versions,
        fiscal notes, subjects and the status page are then attached.

        :raises ValueError: when ``bill_id`` starts with none of the known
            bill/resolution prefixes (previously this surfaced as an
            ``UnboundLocalError`` on ``bill_type``)
        """
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Normalise non-breaking spaces so the ' -- ' split below matches.
        # Written as an explicit escape: a literal U+00A0 in the source is
        # indistinguishable from an ordinary space.
        header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
        title, primary_sponsor = header.split(' -- ')

        prefix_types = (
            (('H.B.', 'S.B.'), 'bill'),
            (('H.R.', 'S.R.'), 'resolution'),
            (('H.C.R.', 'S.C.R.'), 'concurrent resolution'),
            (('H.J.R.', 'S.J.R.'), 'joint resolution'),
        )
        for prefixes, type_name in prefix_types:
            if bill_id.startswith(prefixes):
                bill_type = [type_name]
                break
        else:
            raise ValueError("unrecognized bill id prefix in %r" % bill_id)

        # Blank out blacklisted fragments, then collapse the whitespace
        # they leave behind.
        for flag in SUB_BLACKLIST:
            bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

            # The version name is the text just before the HTML link.
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            # The PDF link, when present, is the next sibling; getnext()
            # returns None when the HTML link is the last element.
            sibling = link.getnext()
            if sibling is not None and sibling.text == "PDF":
                bill.add_version(name,
                                 sibling.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

            bill.add_document("Fiscal Note", link.attrib['href'])

        bill['subjects'] = [
            link.text.strip()
            for link in page.xpath("//a[contains(@href, 'RelatedBill')]")
        ]

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
Exemple #15
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape the leg.state.nv.us history lists into saved ``Bill`` objects.

        For every document type the history-list page is fetched, and each
        bill page it links to yields a ``Bill`` with subjects, PDF text
        versions, sponsors, minutes documents, actions and votes.

        :param chamber: chamber recorded on each ``Bill``
        :param insert: session path segment used to build the report URLs
        :param session: session recorded on each ``Bill``
        :param year: forwarded to ``scrape_votes``
        """
        site = "http://www.leg.state.nv.us"
        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution', 9: 'petition'}
        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            for link in self.scrape_links(parentpage_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                # Non-breaking spaces are flattened before parsing.
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                # set() drops duplicate subjects.
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))

                # Version links sit in the cell after the "Bill Text" label.
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                for text_url in billtext.xpath("./a"):
                    bill.add_version(text_url.text.strip(),
                                     text_url.attrib['href'],
                                     mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # The date of the n-th minutes link is read from row n + 1
                # of the same table (rows are counted from 2).
                minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_no, mr in enumerate(minutes_links, 2):
                    minutes_url = mr.xpath("string(@href)")
                    # make_links_absolute() above already resolved this
                    # href, so prefixing the host again produced a doubled,
                    # unusable URL. Only prefix hrefs that are still
                    # relative.
                    if not minutes_url.startswith('http'):
                        minutes_url = site + minutes_url
                    date_parts = mr.xpath(
                        "string(//table[4]/tr[%s]/td[2])" % row_no).split()
                    minutes_date = (date_parts[0] + date_parts[1] +
                                    date_parts[2] + " Minutes")
                    bill.add_document(minutes_date, minutes_url)

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemple #16
0
    def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
        """Fetch one bill page, build its ``Bill`` and save it.

        Sponsors come from the "bill-author-info" blocks, actions from the
        "bill-actions" list (walked in reverse page order), and versions,
        documents and roll calls from whatever ``BillDocuments`` yields.

        :param subject: optional single subject; stored as a one-item list.
        """
        self.logger.info('GET ' + url)
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        bill = Bill(term, chamber, bill_id, title)
        bill.add_source(url)
        if subject is not None:
            bill['subjects'] = [subject]

        # Sponsors: the label printed on the page decides the role.
        role_for_label = {
            'author': 'primary',
            'co-author': 'cosponsor',
            'sponsor': 'cosponsor',
            'co-sponsor': 'cosponsor',
        }
        for info in doc.xpath('//div[contains(@class, "bill-author-info")]'):
            sponsor_name = info.xpath('string(b)').strip()
            label = info.xpath('string(p)').strip().lower()
            bill.add_sponsor(role_for_label[label], sponsor_name)

        # Actions
        chamber_for_letter = {'H': 'lower', 'S': 'upper'}
        for item in reversed(doc.xpath('//div[@id="bill-actions"]//li')):
            if item.text_content() == 'None currently available.':
                continue
            actor = chamber_for_letter[item.xpath('string(strong)').strip()]
            raw_date = item.xpath('string(span[@class="document-date"])').strip()
            # Some resolution actions have no dates.
            if not raw_date:
                continue
            when = datetime.datetime.strptime(raw_date, '%m/%d/%Y')
            text = item.xpath('string(span[2])').strip()
            if not text:
                continue
            action = dict(date=when, actor=actor, action=text)
            action.update(**self.categorizer.categorize(text))
            bill.add_action(**action)

        # Documents (including votes)
        for kind, meta in BillDocuments(self, doc):
            if kind == 'version':
                attach = bill.add_version
            elif kind == 'document':
                attach = bill.add_document
            elif kind == 'rollcall':
                self.add_rollcall(chamber, bill, meta)
                continue
            else:
                continue
            attach(meta.title or meta.text, url=meta.url,
                   mimetype='application/pdf')

        self.save_bill(bill)
Exemple #17
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape the measures listed on the leg.state.nv.us history reports.

        Each document type's ``HistListBills.cfm`` listing is fetched; every
        linked page becomes a ``Bill`` with its subjects, a single
        "Bill Text" version, sponsors and "<date> Agenda" documents, then is
        passed to ``scrape_actions`` / ``scrape_votes`` and saved.
        """
        # DoctypeID query value -> bill type recorded on the Bill.
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            listing_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
                insert, docnum)
            for href in self.scrape_links(listing_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, href)

                page = self.urlopen(page_path).replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                bill['subjects'] = self.subject_mapping[bill_id]

                text_href = root.xpath(
                    "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                bill.add_version("Bill Text",
                                 "http://www.leg.state.nv.us" + text_href,
                                 mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)
                for role, names in (('primary', primary),
                                    ('cosponsor', secondary)):
                    for leg in names:
                        bill.add_sponsor(role, leg)

                # The date for the first link is read from table row 2, the
                # next from row 3, and so on.
                agenda_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_number, anchor in enumerate(agenda_links, 2):
                    doc_url = "http://www.leg.state.nv.us" + anchor.xpath("string(@href)")
                    date_path = "string(//table[4]/tr[%s]/td[2])" % row_number
                    parts = anchor.xpath(date_path).split()
                    doc_name = parts[0] + parts[1] + parts[2] + " Agenda"
                    bill.add_document(doc_name, doc_url)

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemple #18
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        The page header supplies the title and primary sponsor.  HTML (and,
        when present, PDF) versions, fiscal notes and subjects are collected
        from the page's links, and the status page is handed to
        ``parse_status``.  If the page cannot be fetched a warning is logged
        and the bill is skipped.

        :param chamber: passed straight to the ``Bill`` constructor.
        :param session: passed straight to the ``Bill`` constructor.
        :param bill_id: identifier such as ``H.B. 1``; its prefix decides
            the bill type.
        :param url: address of the bill page.
        """
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace(' ', ' ')
        title, primary_sponsor = header.split(' -- ')

        # NOTE: an id with any other prefix leaves bill_type unbound, so the
        # Bill() call below raises for it.
        if bill_id.startswith(('H.B.', 'S.B.')):
            bill_type = ['bill']
        elif bill_id.startswith(('H.R.', 'S.R.')):
            bill_type = ['resolution']
        elif bill_id.startswith(('H.C.R.', 'S.C.R.')):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith(('H.J.R.', 'S.J.R.')):
            bill_type = ['joint resolution']

        # Blank out blacklisted fragments, then collapse the whitespace
        # left behind.
        for flag in SUB_BLACKLIST:
            bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):

            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            # getnext() returns None when the HTML link is the last sibling,
            # so guard before reading .text.
            pdf_link = link.getnext()
            if pdf_link is not None and pdf_link.text == "PDF":
                bill.add_version(name, pdf_link.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']"):

            bill.add_document("Fiscal Note", link.attrib['href'])

        bill['subjects'] = [
            link.text.strip()
            for link in page.xpath("//a[contains(@href, 'RelatedBill')]")]

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
Exemple #19
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape the measures listed on the leg.state.nv.us history reports.

        Every page linked from each document type's listing becomes a
        ``Bill`` carrying de-duplicated subjects, a "Bill Text" version for
        each content table that mentions it, sponsors and "<date> Agenda"
        documents; actions and votes are delegated to ``scrape_actions``
        and ``scrape_votes`` before the bill is saved.
        """
        # DoctypeID query value -> bill type recorded on the Bill.
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.iteritems():
            listing_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            for href in self.scrape_links(listing_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, href)

                page = self.get(page_path).text.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                bill['subjects'] = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' not in table.text_content():
                        continue
                    text_href = table.xpath("string(tr/td[2]/a/@href)")
                    bill.add_version("Bill Text",
                                     "http://www.leg.state.nv.us" + text_href,
                                     mimetype='application/pdf')

                primary, secondary = self.scrape_sponsors(page)
                for role, names in (('primary', primary),
                                    ('cosponsor', secondary)):
                    for leg in names:
                        bill.add_sponsor(role, leg)

                # Row 2 of the table holds the date for the first link,
                # row 3 for the second, and so on.
                agenda_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_number, anchor in enumerate(agenda_links, 2):
                    doc_url = "http://www.leg.state.nv.us" + anchor.xpath("string(@href)")
                    date_path = "string(//table[4]/tr[%s]/td[2])" % row_number
                    parts = anchor.xpath(date_path).split()
                    doc_name = parts[0] + parts[1] + parts[2] + " Agenda"
                    bill.add_document(doc_name, doc_url)

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
Exemple #20
0
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        """Parse one bill-information page into a ``Bill`` and save it.

        Returns without saving when the page has no link to
        ``<bill_id>/bill.doc``, which the code treats as a withdrawn bill.
        Otherwise the title, the most recent version, sponsors, dated
        actions and (when linked) the vote-history PDF are attached.

        :param chamber: chamber for the ``Bill``; also the actor for any
            action whose text does not name a chamber.
        :param session: session identifier; characters 2-3 are read as the
            two-digit year of the actions.
        :param bill_id: bill identifier, also used to build the version URL.
        :param bill_info_url: address of the bill-information page.
        """
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    r'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            # The actions live in the last paragraph after the version link,
            # one text node per action.
            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                # Entries look like "<Mon> <day>-<description>"; split on
                # the first hyphen only so hyphens in the description stay.
                date_text, _, action = action.partition('-')

                # Parse the date together with its year.  Parsing '%b %d'
                # alone uses year 1900, which is not a leap year, so
                # "Feb 29" would raise before the year could be patched in.
                action_year = int('20' + session[2:4])
                action_date = dt.datetime.strptime(
                    '%s %d' % (date_text, action_year), '%b %d %Y')

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Exemple #21
0
    def process_bill(self, data):
        """Build a ``Bill`` from an already-structured ``data`` mapping and save it.

        Copies the session, identifier, title, subjects and classification,
        the first abstract (as ``summary``), the ``extras`` entries, and
        every action, source, sponsorship, version link, document link and
        alternate title found in ``data``.
        """
        origin = parse_psuedo_id(data['from_organization'])
        chamber = origin['classification']
        bill = Bill(
            data['legislative_session'],
            chamber,
            data['identifier'],
            data['title'],
            subjects=data['subject'],
            type=data['classification'],
        )

        abstracts = data['abstracts']
        if abstracts:
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for entry in data['actions']:
            organization = parse_psuedo_id(entry['organization_id'])
            actor = organization['classification']
            bill.add_action(
                actor,
                entry['description'],
                parse_date(entry['date']),
                type=_action_categories(entry['classification']),
            )
            # TODO: related entities

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # One version / document entry is added per link.
        for ver in data['versions']:
            for ver_link in ver['links']:
                bill.add_version(
                    ver['note'],
                    ver_link['url'],
                    mimetype=ver_link['media_type'],
                    date=parse_date(ver['date']),
                )

        for document in data['documents']:
            for doc_link in document['links']:
                bill.add_document(
                    document['note'],
                    doc_link['url'],
                    mimetype=doc_link['media_type'],
                    date=parse_date(document['date']),
                )

        for alt_title in data['other_titles']:
            bill.add_title(alt_title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
Exemple #22
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape the measures listed on the leg.state.nv.us history reports.

        Each page linked from a document type's listing is decoded as UTF-8
        and turned into a ``Bill`` with subjects, one "Bill Text" version,
        sponsors and "<date> Minutes" documents; ``scrape_actions`` and
        ``scrape_votes`` add the rest before the bill is saved.
        """
        # DoctypeID query value -> bill type recorded on the Bill.
        doc_type = {1: "bill", 3: "resolution", 5: "concurrent resolution", 6: "joint resolution"}
        for docnum, bill_type in doc_type.iteritems():
            listing_url = "http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s" % (
                insert,
                docnum,
            )
            for href in self.scrape_links(listing_url):
                page_path = "http://www.leg.state.nv.us/Session/%s/Reports/%s" % (insert, href)
                with self.urlopen(page_path) as page:
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    bill = Bill(session, chamber, bill_id, title, type=bill_type)
                    bill["subjects"] = self.subject_mapping[bill_id]

                    text_href = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    bill.add_version("Bill Text", "http://www.leg.state.nv.us" + text_href)

                    primary, secondary = self.scrape_sponsors(page)
                    for role, names in (("primary", primary), ("cosponsor", secondary)):
                        for leg in names:
                            bill.add_sponsor(role, leg)

                    # The date for the first link is read from table row 2,
                    # the next from row 3, and so on.
                    minutes_links = root.xpath("//table[4]/tr/td[3]/a")
                    for row_number, anchor in enumerate(minutes_links, 2):
                        doc_url = "http://www.leg.state.nv.us" + anchor.xpath("string(@href)")
                        date_path = "string(//table[4]/tr[%s]/td[2])" % row_number
                        parts = anchor.xpath(date_path).split()
                        doc_name = parts[0] + parts[1] + parts[2] + " Minutes"
                        bill.add_document(doc_name, doc_url)

                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemple #23
0
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        """Scrape one bill page and save the resulting ``Bill``.

        The title is the text following the ninth ``<br>`` on the page,
        falling back to ``short_title`` when that text is empty.  Actions,
        the listed version links, Senate roll calls, fiscal impact
        statements and subjects are then attached.  "SCR 0003" is skipped
        outright.
        """
        if bill_id == "SCR 0003":
            return

        with self.urlopen(url) as page:
            doc = lxml.html.fromstring(page)
            doc.make_links_absolute(url)

            title = (doc.xpath("//br")[8].tail or short_title).strip()

            # Order matters: "JR" and "CR" also end in "R".  An abbreviation
            # matching none of these leaves bill_type unbound.
            abbrev = bill_id.split()[0]
            suffix_types = (
                ("B", "bill"),
                ("JR", "joint resolution"),
                ("CR", "concurrent resolution"),
                ("R", "resolution"),
            )
            for suffix, kind in suffix_types:
                if abbrev.endswith(suffix):
                    bill_type = [kind]
                    break

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)

            actions_anchor = doc.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, actions_anchor.attrib["href"])

            # Only the first matching link is kept for each version label.
            version_labels = ("Introduced Bill", "House Bill", "Senate Bill", "Engrossed Bill", "Enrolled Act")
            for label in version_labels:
                matches = doc.xpath("//a[contains(., '%s')]" % label)
                if matches:
                    bill.add_version(label, matches[0].attrib["href"])

            for rollcall in doc.xpath("//a[contains(@href, 'Srollcal')]"):
                self.scrape_senate_vote(bill, rollcall.attrib["href"])

            for fiscal in doc.xpath("//a[contains(@href, 'FISCAL')]"):
                # Keep only the text before the first "(".
                number = fiscal.text.strip().partition("(")[0]
                bill.add_document("Fiscal Impact Statement #%s" % number, fiscal.attrib["href"])

            bill["subjects"] = self.subjects[bill_id]

            self.save_bill(bill)
Exemple #24
0
    def scrape(self, chamber, session):
        """Scrape consecutively numbered bills until one cannot be fetched.

        Numbering starts at SB 1 for the upper chamber and HB 4001
        otherwise, and increases by one per bill.  For each page the title,
        sponsors (the first is primary, the rest cosponsors), versions,
        optional document sections and action history are attached to a
        ``Bill``, which is then saved.

        :param chamber: ``'upper'`` selects Senate numbering; any other
            value selects House numbering.
        :param session: session identifier, validated first.
        """
        self.validate_session(session)

        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'

        while True:
            raw_page = self.scrape_bill(session, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy thing.
            # The check has to happen before parsing: once wrapped in
            # BeautifulSoup the value is never None, so the loop could not
            # end cleanly.
            if raw_page is None:
                return
            bill_page = BeautifulSoup(raw_page)

            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n', '').replace('\r', '')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            #sponsors
            sponsor_links = bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a')
            for position, name in enumerate(sponsor_links):
                role = 'primary' if position == 0 else 'cosponsor'
                the_bill.add_sponsor(role, name.string)

            #versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_version(*r)

            #documents
            # Both sections are optional; look the element up directly
            # instead of serialising the whole page to search for its id.
            for section_id in ('frg_billstatus_HlaTable',
                               'frg_billstatus_SfaSection'):
                sections = bill_page.findAll(id=section_id)
                if not sections:
                    continue
                for doc in sections[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r:
                        the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
Exemple #25
0
    def process_bill(self, data):
        """Convert a structured ``data`` mapping into a ``Bill`` and save it.

        The bill takes its session, identifier, title, subjects and type
        from ``data``; the first abstract becomes ``summary`` and the
        ``extras`` entries are merged in.  Actions, sources, sponsorships,
        version links, document links and other titles are then added.
        """
        from_org = parse_psuedo_id(data['from_organization'])
        chamber = from_org['classification']
        bill = Bill(
            data['legislative_session'],
            chamber,
            data['identifier'],
            data['title'],
            subjects=data['subject'],
            type=data['classification'],
        )

        if data['abstracts']:
            first_abstract = data['abstracts'][0]
            bill['summary'] = first_abstract['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(act['organization_id'])['classification']
            description = act['description']
            when = parse_date(act['date'])
            categories = _action_categories(act['classification'])
            bill.add_action(actor, description, when, type=categories)
            # TODO: related entities

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(sponsor['classification'], sponsor['name'])

        # Versions and documents share a shape: a note, a date and a list
        # of links, each of which is added as its own entry.
        for version in data['versions']:
            for link in version['links']:
                note = version['note']
                link_url = link['url']
                media_type = link['media_type']
                bill.add_version(note, link_url, mimetype=media_type,
                                 date=parse_date(version['date']))

        for doc in data['documents']:
            for link in doc['links']:
                note = doc['note']
                link_url = link['url']
                media_type = link['media_type']
                bill.add_document(note, link_url, mimetype=media_type,
                                  date=parse_date(doc['date']))

        for other_title in data['other_titles']:
            bill.add_title(other_title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
Exemple #26
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        Title and primary sponsor come from the page header; HTML versions,
        fiscal notes and subjects are taken from the page's links, and the
        status page is passed to ``parse_status``.  A page that cannot be
        fetched is logged and skipped.
        """
        try:
            raw = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(url)

        header = doc.xpath("//h3/br")[0].tail.replace(" ", " ")
        title, primary_sponsor = header.split(" -- ")

        # An id matching none of these prefixes leaves bill_type unbound.
        type_prefixes = (
            (("H.B.", "S.B."), "bill"),
            (("H.R.", "S.R."), "resolution"),
            (("H.C.R.", "S.C.R."), "concurrent resolution"),
            (("H.J.R.", "S.J.R."), "joint resolution"),
        )
        for prefixes, kind in type_prefixes:
            if bill_id.startswith(prefixes):
                bill_type = [kind]
                break

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor("primary", primary_sponsor)
        bill.add_source(url)

        # The version name is the text just before each "HTML" link.
        for anchor in doc.xpath('//a[contains(@href, "bills/") and text() = "HTML"]'):
            version_name = anchor.getprevious().tail.strip()
            bill.add_version(version_name, anchor.attrib["href"])

        for anchor in doc.xpath("//a[contains(@href, 'fnotes') and text() = 'HTML']"):
            bill.add_document("Fiscal Note", anchor.attrib["href"])

        bill["subjects"] = [
            anchor.text.strip()
            for anchor in doc.xpath("//a[contains(@href, 'RelatedBill')]")
        ]

        status_anchor = doc.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_anchor.attrib["href"])

        self.save_bill(bill)
Exemple #27
0
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        """Scrape one bill's info page and save the resulting ``Bill``.

        The page address is built from ``self.urls['info']``.  The abstract
        becomes the title; the primary sponsor, the full-summary link and
        the action history are attached, and votes are scraped when a
        votes link is present.
        """
        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as page:
            doc = lxml.html.fromstring(page)
            title = doc.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)

            # Primary Sponsor: whatever follows the last "by", minus any
            # asterisks.
            sponsor_span = doc.xpath("//span[@id='lblBillSponsor']")[0]
            sponsor = sponsor_span.text_content().split("by")[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary_link = doc.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions: every row after the first in the history table.
            history = doc.xpath(
                "//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']"
            )[0]
            for row in history.xpath("tr[position()>1]"):
                action_taken = row.xpath("td")[0].text
                date_text = row.xpath("td")[1].text.strip()
                action_date = datetime.datetime.strptime(date_text, '%m/%d/%Y')
                bill.add_action(chamber, action_taken, action_date)

            vote_anchors = doc.xpath("//span[@id='lblBillVotes']/a")
            if vote_anchors:
                votes_href = vote_anchors[0].get('href')
                votes_url = ('http://wapp.capitol.tn.gov/apps/Billinfo/%s' %
                             (votes_href, ))
                bill = self.scrape_votes(bill, sponsor, votes_url)

            self.save_bill(bill)
Exemple #28
0
    def scrape(self, chamber, session):
        """Scrape every bill row of the legisweb.state.wy.us bill reference table.

        Each row after the header becomes a ``Bill``; the digest is added by
        ``scrape_digest``, PDF versions come from columns 8, 11 and 12, and
        the fiscal note and summary come from columns 9 and 14.
        """
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        rows = page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")
        for row in rows[1:]:
            bill_id = row.xpath("string(td[1])").strip()
            title = row.xpath("string(td[2])").strip()

            is_joint = bill_id[0:2] in ['SJ', 'HJ']
            bill_type = 'joint resolution' if is_joint else 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            for column in (8, 11, 12):
                for anchor in row.xpath('td[%d]//a' % column):
                    # skip references to other bills
                    if anchor.text.startswith('See'):
                        continue
                    bill.add_version(anchor.text, anchor.get('href'),
                                     mimetype='application/pdf')

            # documents: only the first link in each column is used
            for doc_name, column in (('Fiscal Note', 9), ('Summary', 14)):
                anchors = row.xpath('td[%d]//a' % column)
                if anchors:
                    bill.add_document(doc_name, anchors[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #29
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        The header after the first ``<h3>`` line break is split on " -- "
        into title and primary sponsor.  HTML versions, fiscal notes and
        subjects are collected from the page's links, and the status page
        is handed to ``parse_status``.
        """
        with self.urlopen(url) as page:
            doc = lxml.html.fromstring(page)
            doc.make_links_absolute(url)

            header = doc.xpath('//h3/br')[0].tail.replace(' ', ' ')
            title, primary_sponsor = header.split(' -- ')

            # An id with any other prefix leaves bill_type unbound.
            if bill_id.startswith(('H.B.', 'S.B.')):
                bill_type = ['bill']
            elif bill_id.startswith(('H.R.', 'S.R.')):
                bill_type = ['resolution']
            elif bill_id.startswith(('H.C.R.', 'S.C.R.')):
                bill_type = ['concurrent resolution']
            elif bill_id.startswith(('H.J.R.', 'S.J.R.')):
                bill_type = ['joint resolution']

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            # Each "HTML" link is named by the text just before it.
            html_versions = doc.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]')
            for anchor in html_versions:
                bill.add_version(anchor.getprevious().tail.strip(),
                                 anchor.attrib['href'])

            fiscal_notes = doc.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']")
            for anchor in fiscal_notes:
                bill.add_document("Fiscal Note", anchor.attrib['href'])

            subject_links = doc.xpath("//a[contains(@href, 'RelatedBill')]")
            bill['subjects'] = [anchor.text.strip() for anchor in subject_links]

            status_anchor = doc.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_anchor.attrib['href'])

            self.save_bill(bill)
Exemple #30
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        Reads the title and primary sponsor from the page header, then
        attaches HTML versions, fiscal notes and subjects found among the
        page's links and delegates the status page to ``parse_status``.
        """
        with self.urlopen(url) as page:
            tree = lxml.html.fromstring(page)
            tree.make_links_absolute(url)

            header_text = tree.xpath('//h3/br')[0].tail.replace(' ', ' ')
            title, primary_sponsor = header_text.split(' -- ')

            # First matching prefix group wins; no match leaves bill_type
            # unbound.
            prefix_groups = (
                (('H.B.', 'S.B.'), 'bill'),
                (('H.R.', 'S.R.'), 'resolution'),
                (('H.C.R.', 'S.C.R.'), 'concurrent resolution'),
                (('H.J.R.', 'S.J.R.'), 'joint resolution'),
            )
            for prefixes, kind in prefix_groups:
                if bill_id.startswith(prefixes):
                    bill_type = [kind]
                    break

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_sponsor('primary', primary_sponsor)
            bill.add_source(url)

            for version_link in tree.xpath(
                    '//a[contains(@href, "bills/") and text() = "HTML"]'):
                # The version name is the text just before the link.
                label = version_link.getprevious().tail.strip()
                bill.add_version(label, version_link.attrib['href'])

            for note_link in tree.xpath(
                    "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
                bill.add_document("Fiscal Note", note_link.attrib['href'])

            bill['subjects'] = [
                subject_link.text.strip()
                for subject_link in tree.xpath(
                    "//a[contains(@href, 'RelatedBill')]")
            ]

            status_link = tree.xpath('//a[contains(@href, "billsta")]')[0]
            self.parse_status(bill, status_link.attrib['href'])

            self.save_bill(bill)
Exemple #31
0
    def scrape(self, chamber, session):
        """Scrape and save every bill on the bill reference page.

        `chamber` must be 'upper' or 'lower'; it selects the "SF" or "HB"
        listing for `session`.  One Bill is built and saved per table row
        after the first.
        """
        type_code = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, type_code))
        page = self.lxmlize(url)

        rows = page.xpath("//table[@id='ctl00_cphContent_gvBills']//tr")
        # rows[0] is skipped (presumably the table header).
        for row in rows[1:]:
            bill_id = row.xpath("string(td[1])").strip()
            title = row.xpath("string(td[2])").strip()

            # Ids beginning with SJ/HJ are joint resolutions.
            is_joint = bill_id.startswith(('SJ', 'HJ'))
            bill_type = 'joint resolution' if is_joint else 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # Versions: gather anchors from columns 8, 11 and 12, in order.
            version_anchors = []
            for cell_path in ('td[8]//a', 'td[11]//a', 'td[12]//a'):
                version_anchors.extend(row.xpath(cell_path))

            for anchor in version_anchors:
                # Anchors whose text starts with "See" point at other bills.
                if not anchor.text.startswith('See'):
                    bill.add_version(anchor.text, anchor.get('href'),
                                     mimetype='application/pdf')

            # Documents: only the first anchor of each column is used.
            for doc_name, cell_path in (('Fiscal Note', 'td[9]//a'),
                                        ('Summary', 'td[14]//a')):
                anchors = row.xpath(cell_path)
                if anchors:
                    bill.add_document(doc_name, anchors[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #32
0
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        """Scrape one bill info page and save the resulting Bill.

        The page URL is built from self.urls['info'] with `bill_number`
        and `ga_num`.  Title, primary sponsor, the full-summary document
        and the action history are read from the page; if a votes link is
        present the bill is passed through self.scrape_votes first.
        """
        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as raw_page:
            doc = lxml.html.fromstring(raw_page)
            title = doc.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)

            # Primary sponsor: whatever follows the last "by" in the
            # sponsor span, with asterisks removed.
            sponsor_span = doc.xpath("//span[@id='lblBillSponsor']")[0]
            sponsor = sponsor_span.text_content().split("by")[-1]
            sponsor = sponsor.replace('*','').strip()
            bill.add_sponsor('primary',sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc: first link inside the sponsor span.
            summary_link = doc.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions: every row after the first of the history table.
            history = doc.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
            for row in history.xpath("tr[position()>1]"):
                cells = row.xpath("td")
                action_taken = cells[0].text
                action_date = datetime.datetime.strptime(
                    cells[1].text.strip(), '%m/%d/%Y')
                bill.add_action(chamber, action_taken, action_date)

            vote_anchors = doc.xpath("//span[@id='lblBillVotes']/a")
            if vote_anchors:
                votes_href = vote_anchors[0].get('href')
                # scrape_votes hands back the bill to keep working with.
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_href,))

            self.save_bill(bill)
Exemple #33
0
    def scrape_bills(self, chamber_to_scrape, session):
        """Scrape and save every measure of `chamber_to_scrape` for `session`.

        Reads the session's all-measures XML listing, then for each
        measure of the requested chamber fetches its details page and
        collects sponsors, versions, actions, a veto document when an
        action mentions "Veto", and votes (one per distinct vote URL).
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        with self.urlopen(url) as listing:
            root = lxml.etree.fromstring(listing, lxml.etree.HTMLParser())
            for measure in root.xpath('//lastaction/msrgroup'):
                bill_id = measure.xpath('string(measure)').replace(" ", "")
                chamber = "upper" if bill_id[0] == "S" else "lower"

                # The second character of the id encodes the measure type.
                bill_type = {'B':'bill', 'C': 'concurrent resolution',
                             'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

                # just skip past bills that are of the wrong chamber
                if chamber != chamber_to_scrape:
                    continue

                link = measure.xpath('string(actionlink)').replace("..", "")
                # NOTE: main_doc / main_doc_url are computed but not used below.
                main_doc = measure.xpath('string(measurelink)').replace("../../../", "")
                main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as details_page:
                    # Re-encode latin1 -> utf8 before parsing.
                    details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type, longtitle=longtitle)

                    # Sponsors: only the first word of p_name is kept as
                    # the primary sponsor's name.
                    sponsor_words = details_root.xpath('string(//p_name)').split()
                    if sponsor_words:
                        sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, sponsor_link)
                        bill.add_sponsor("primary", sponsor_words[0],
                                         main_sponsor_url=sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        if not leg:
                            continue
                        leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                        bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

                    # Versions: (xpath, version name, always add?).  The
                    # optional ones are added only when their path
                    # contains "documents".
                    version_specs = (
                        ('string(//current_other)', "Current version", True),
                        ('string(//intro_other)', "As Introduced", True),
                        ('string(//cmtesub_other)', "Committee Substitute", False),
                        ('string(//passed_other)', "As Passed the " + chamber, False),
                        ('string(//asg_other)', "Approved by the Governor", False),
                    )
                    for query, version_name, always in version_specs:
                        rel_path = details_root.xpath(query).replace("../../../../", "")
                        if always or "documents" in rel_path:
                            bill.add_version(
                                version_name,
                                "http://billstatus.ls.state.ms.us/" + rel_path)

                    # avoid duplicate votes
                    seen_votes = set()

                    # Actions
                    for action_el in details_root.xpath('//history/action'):
                        action_num = int(action_el.xpath('string(act_number)').strip())
                        act_vote = action_el.xpath('string(act_vote)').replace("../../../..", "")
                        action_desc = action_el.xpath('string(act_desc)')
                        # act_desc is "<mm/dd> <description>"; the year is
                        # taken from the first four characters of session.
                        date, action_desc = action_desc.split(" ", 1)
                        date = datetime.strptime(date + "/" + session[0:4],
                                                 "%m/%d/%Y")

                        if action_desc.startswith("(H)"):
                            actor, action = "lower", action_desc[4:]
                        elif action_desc.startswith("(S)"):
                            actor, action = "upper", action_desc[4:]
                        else:
                            actor, action = "executive", action_desc

                        if "Veto" in action:
                            veto_path = details_root.xpath("string(//veto_other)")
                            veto_path = veto_path.replace("../../../../", "")
                            bill.add_document(
                                "Veto",
                                "http://billstatus.ls.state.ms.us/" + veto_path)

                        # First matching prefix decides the type.
                        atype = next(
                            (prefix_type
                             for prefix, prefix_type in self._action_types
                             if action.startswith(prefix)),
                            'other')

                        bill.add_action(actor, action, date, type=atype,
                                        action_num=action_num)

                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            if vote_url not in seen_votes:
                                seen_votes.add(vote_url)
                                vote = self.scrape_votes(vote_url, action,
                                                         date, actor)
                                vote.add_source(vote_url)
                                bill.add_vote(vote)

                    bill.add_source(bill_details_url)
                    self.save_bill(bill)
Exemple #34
0
    def scrape_bill(self, session, chamber, bill_id, short_title, url):
        """Scrape the bill page at `url` and save the bill unless skipped.

        Always returns None.  Nothing is saved when:
          * fetching `url` raises scrapelib.HTTPError (a warning is logged),
          * the first <h1> text on the page is "Bill Withdrawn",
          * the bill has no sponsors and its page contains "Vehicle Bill",
          * the bill has no sponsors and its last action is
            "Withdrawn prior to first reading".

        Raises ValueError when the abbreviation at the start of `bill_id`
        ends in none of B, JR, CR or R.
        """

        def guess_mimetype(link):
            # Links containing "pdf" are PDFs; everything else is HTML.
            return "application/pdf" if "pdf" in link else "text/html"

        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.logger.warning("500 error at: %r" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # check for Bill Withdrawn header
        h1text = page.xpath("//h1/text()")
        if h1text and h1text[0] == "Bill Withdrawn":
            return

        # The title is the text trailing the ninth <br>; fall back to the
        # short title when that text is empty.
        title = page.xpath("//br")[8].tail
        if not title:
            title = short_title
        title = title.strip()

        # Suffix order matters: "JR" and "CR" must be tested before "R".
        abbrev = bill_id.split()[0]
        type_by_suffix = (
            ("B", "bill"),
            ("JR", "joint resolution"),
            ("CR", "concurrent resolution"),
            ("R", "resolution"),
        )
        for suffix, type_name in type_by_suffix:
            if abbrev.endswith(suffix):
                bill_type = [type_name]
                break
        else:
            # Fail with a clear message instead of an unbound local.
            raise ValueError(
                "Unrecognized bill type %r in bill id %r" % (abbrev, bill_id))

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib["href"])

        # For each known version label, use the first link whose text
        # contains it.
        version_path = "//a[contains(., '%s')]"
        for version_type in ("Introduced Bill", "House Bill", "Senate Bill",
                             "Engrossed Bill", "Enrolled Act"):
            links = page.xpath(version_path % version_type)
            if links:
                version_url = links[0].attrib["href"]
                bill.add_version(version_type, version_url,
                                 mimetype=guess_mimetype(version_url))

        for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
            self.scrape_senate_vote(bill, vote_link.attrib["href"])

        for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
            self.scrape_house_vote(bill, vote_link.attrib["href"])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            # The document number is the link text before any "(".
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib["href"])

        bill["subjects"] = self.subjects[bill_id]

        # Also retrieve the "latest printing" bill if it hasn't
        # been found yet.
        latest_printing = '//a[contains(@href, "bills")]/@href'
        for printing_url in set(page.xpath(latest_printing)):
            try:
                bill.add_version("Latest printing", printing_url,
                                 mimetype=guess_mimetype(printing_url))
            except ValueError:
                # The url was a duplicate.
                pass

        if not bill["sponsors"]:

            # Indiana has so-called 'vehicle bills', which are empty
            # placeholders that may later get injected with content
            # (e.g. 2011 SB 0192).
            source_url = bill["sources"][0]["url"]
            raw_page = self.urlopen(source_url)
            if "Vehicle Bill" in raw_page:
                msg = "Skipping vehicle bill: {bill_id}."
                self.logger.info(msg.format(**bill))
                return

            # And some bills are withdrawn before first reading, in which
            # case they don't really exist, and the main version link
            # will 404.
            withdrawn = "Withdrawn prior to first reading"
            if bill["actions"]:
                if bill["actions"][-1]["action"] == withdrawn:
                    msg = "Skipping bill withdrawn before first " "reading: {bill_id}."
                    self.logger.info(msg.format(**bill))
                    return

        self.save_bill(bill)
Exemple #35
0
    def process_bill(self, data):
        """Build a Bill from the bill dict `data` and save it.

        Copies the summary (first abstract, if any), extras, actions,
        sources, sponsorships, versions, documents, other titles, related
        bills and alternate identifiers onto the Bill.  A chamber
        classification of 'legislature' is stored as 'upper'.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            chamber = 'upper'

        bill = Bill(data['legislative_session'],
                    chamber,
                    data['identifier'],
                    data['title'],
                    subjects=data['subject'],
                    type=data['classification'])

        if data['abstracts']:
            first_abstract = data['abstracts'][0]
            bill['summary'] = first_abstract['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(
                act['organization_id'])['classification']

            # Sort related entity names into committees (organizations)
            # and legislators (people); other entity types are ignored.
            names_by_kind = {'organization': [], 'person': []}
            for entity in act['related_entities']:
                bucket = names_by_kind.get(entity['entity_type'])
                if bucket is not None:
                    bucket.append(entity['name'])

            bill.add_action(actor,
                            act['description'],
                            parse_date(act['date']),
                            type=_action_categories(act['classification']),
                            committees=names_by_kind['organization'],
                            legislators=names_by_kind['person'],
                            **act.get('extras', {}))

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # Every link of a version becomes its own version entry.
        for ver in data['versions']:
            for ver_link in ver['links']:
                bill.add_version(ver['note'],
                                 ver_link['url'],
                                 mimetype=ver_link['media_type'],
                                 date=parse_date(ver['date']),
                                 **ver.get('extras', {}))

        # Likewise, every link of a document becomes its own document.
        for document in data['documents']:
            for doc_link in document['links']:
                bill.add_document(document['note'],
                                  doc_link['url'],
                                  mimetype=doc_link['media_type'],
                                  date=parse_date(document['date']),
                                  **document.get('extras', {}))

        for other in data['other_titles']:
            bill.add_title(other['title'])

        for companion in data['related_bills']:
            bill.add_companion(companion['identifier'],
                               companion['legislative_session'], chamber)

        bill['alternate_bill_ids'] = [
            other_id['identifier'] for other_id in data['other_identifiers']
        ]
        self.save_bill(bill)
Exemple #36
0
    def parse_bill(self, chamber, session, bill_id, url):
        """Parse the bill page at `url` and save the resulting Bill.

        Returns None without saving when the page has no link to the
        bill's "bill.doc" (treated as a withdrawn bill).  Otherwise the
        title, subjects (from self._subjects), most recent version,
        sponsors, actions and an optional "Vote History" document are
        attached to the bill before it is saved.
        """
        # Compiled once; matches e.g. "floor amendment (1) filed".
        amendment_re = re.compile(
            r'floor amendments?( \([a-z\d\-]+\))*'
            r'( and \([a-z\d\-]+\))? filed')
        # Checked in this order so the resulting type list keeps the
        # 1st / 3rd / 2nd ordering.
        reading_types = (
            ('1st reading', 'bill:reading:1'),
            ('3rd reading', 'bill:reading:3'),
            ('2nd reading', 'bill:reading:2'),
        )

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # Links use "SJ"/"SC" where the bill id says "SJR"/"SCR".
            short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id)
            version_links = page.xpath(
                "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)
            if not version_links:
                # Bill withdrawn
                return
            version_link = version_links[0]

            # With exactly two following <p> siblings the first holds the
            # title; otherwise the title is the text before the first one.
            pars = version_link.xpath("following-sibling::p")
            if len(pars) == 2:
                title = pars[0].xpath("string()")
                action_p = pars[1]
            else:
                title = pars[0].getprevious().tail
                action_p = pars[0]

            # Collapse whitespace (including non-breaking spaces).  Same
            # pattern text as a ur'' literal, but valid on Python 2 and 3.
            title = re.sub(u'[\\s\\xa0]+', ' ', title).strip()

            if 'CR' in bill_id:
                bill_type = 'concurrent resolution'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = self._subjects[bill_id]
            bill.add_source(url)

            bill.add_version("Most Recent Version",
                             version_link.attrib['href'])

            for link in page.xpath("//a[contains(@href, 'legislator/')]"):
                bill.add_sponsor('primary', link.text.strip())

            for line in action_p.xpath("string()").split("\n"):
                action = line.strip()
                if (not action or action == 'last action' or
                        'Prefiled' in action):
                    continue

                # Each line is "<Mon DD>-<action text>"; the year comes
                # from the first four characters of the session.
                date_part, _, action = action.partition('-')
                action_date = datetime.datetime.strptime(
                    "%s %s" % (date_part, session[0:4]), '%b %d %Y')

                if action.endswith(('House', '(H)')):
                    actor = 'lower'
                elif action.endswith(('Senate', '(S)')):
                    actor = 'upper'
                else:
                    actor = chamber

                atype = []
                if action.startswith('introduced in'):
                    atype.append('bill:introduced')
                    if '; to ' in action:
                        atype.append('committee:referred')
                elif action.startswith('signed by Governor'):
                    atype.append('governor:signed')
                elif re.match(r'^to [A-Z]', action):
                    atype.append('committee:referred')
                elif action == 'adopted by voice vote':
                    atype.append('bill:passed')

                for phrase, reading_type in reading_types:
                    if phrase in action:
                        atype.append(reading_type)

                # The exact-match branch above may already have recorded
                # the passage; do not list 'bill:passed' twice.
                if ('R' in bill_id and 'adopted by voice vote' in action
                        and 'bill:passed' not in atype):
                    atype.append('bill:passed')

                if amendment_re.search(action):
                    atype.append('amendment:introduced')

                if not atype:
                    atype = ['other']

                bill.add_action(actor, action, action_date, type=atype)

            # The vote history PDF is optional.
            votes_links = page.xpath(
                "//a[contains(@href, 'vote_history.pdf')]")
            if votes_links:
                bill.add_document("Vote History",
                                  votes_links[0].attrib['href'])

            self.save_bill(bill)
Exemple #37
0
    def scrape_bill(self, link, chamber, session):
        legislation_types = {
            'House Bill': 'HB',
            'House Concurrent Resolution': 'HCR',
            'House Joint Resolution': 'HJR',
            'House Resolution': 'HR',
            'Senate Bill': 'SB',
            'Senate Concurrent Resolution': 'SCR',
            'Senate Joint Resolution': 'SJR',
            'Senate Resolution': 'SR',
        }

        base_url = "http://legis.delaware.gov"
        text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"
        try:
            page = self.lxmlize(link, True)
        except requests.exceptions.HTTPError:
            self.logger.warning('404. Apparently the bill hasn\'t been posted')
            return
        nominee = self.get_node(page, './/div[@id="page_header"]/text()')
        if nominee is not None and nominee.strip().lower(
        ) == "nominee information":
            self.logger.info("Nominee, skipping")
            return

        bill_id = self.get_node(
            page, './/div[@align="center" or @style="text-align:center"]')
        try:
            bill_id = bill_id.text_content().strip()
        except IndexError:
            self.logger.warning("Can't find bill number, skipping")
            return

        #some bill_ids include relevant amendments
        #in the form "SB 10 w/SA1", so we fix it here
        bill_id = bill_id.split("w/")[0]
        bill_id = bill_id.split("(")[0]

        leg_type = None
        for long_name, short_name in legislation_types.items():
            if long_name in bill_id:
                leg_type = short_name
                bill_num = bill_id.replace(long_name, "").strip()
                break
        if leg_type:
            bill_id = leg_type + " " + bill_num
        elif "for" in bill_id:
            bill_id = bill_id.split("for")[1]
        else:
            self.logger.warning("Unknown bill type for {}".format(bill_id))
            return

        bill_id = bill_id.replace('&nbsp', "")
        bill_id = bill_id.strip()

        #each row is in its own table
        #there are no classes/ids or anything, so we're going to loop
        #through the individual tables and look for keywords
        #in the first td to tell us what we're looking at
        tables = self.get_nodes(page, './/div[@id="page_content"]/table')

        bill_title = None
        primary_sponsors = []
        cosponsors = []
        bill_url = None
        bill_documents = {}
        action_list = []
        vote_documents = {}
        sub_link = None
        bill_text_avail = False

        if tables is None or not tables:
            self.logger.warning('First xpath didn\'t work.')
            tables = self.get_nodes(page,
                                    './/table[@style="width:837.0px"]/tr')

        for table in tables:
            tds = table.xpath('.//td')
            if len(tds) == 0:
                #some kind of empty table for formatting reasons
                continue
            title_text = tds[0].text_content().strip().lower()

            if title_text.startswith('primary sponsor'):
                pri_sponsor_text = tds[1].text_content()
                primary_sponsors = self.separate_names(pri_sponsor_text)
                #sometimes additional sponsors are in a 3rd td
                #other times the 3rd td contains a blank image
                addl_sponsors = []
                add_spons_text = tds[2].text_content().strip()
                if add_spons_text:
                    add_spons_text = add_spons_text.replace(
                        "Additional Sponsor(s):", "")
                    if not "on behalf of all representatives" in add_spons_text.lower(
                    ):
                        addl_sponsors = self.separate_names(add_spons_text)

            elif title_text.startswith('co-sponsor'):
                cosponsor_text = tds[1].text_content()
                if "none..." in cosponsor_text.lower():
                    cosponsors = []
                    continue
                cosponsors = self.separate_names(cosponsor_text)

            elif title_text.startswith('long title'):
                bill_title = tds[1].text_content().strip()

            elif title_text.startswith('amendment'):
                amendments = tds[1].xpath('.//a')
                for a in amendments:
                    amm = a.text
                    amm_text = "Amendment".format(amm.strip())
                    amm_slg = "+".join(amm.split())
                    amm_link = text_base_url.format(session=session,
                                                    bill_id=amm_slg)
                    bill_documents[amm_text] = amm_link
                    amm_page = self.lxmlize(a.attrib["href"])
                    for tr in amm_page.xpath('//tr'):
                        tds = tr.xpath("./td")
                        if len(tds) > 1:
                            if "voting" in tds[0].text_content().lower():
                                self.find_vote(tds, vote_documents,
                                               "Amendment: ")

            elif title_text.startswith('engrossed version'):
                if tds[1].text_content().strip():
                    engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                    engrossment_link = engrossment_base.format(
                        session=session, bill_id="+".join(bill_id.split()))
                    if bill_url not in bill_documents.values():
                        bill_documents["Engrossed Version"] = engrossment_link

            elif title_text.startswith('substituted'):
                content = tds[1].text_content().strip()
                if ("Substitute" in content and not "Original" in content):
                    sub_link = tds[1].xpath(".//a/@href")[0]

            elif ("full text" in title_text
                  and ("(" not in title_text or "html" in title_text)):
                if tds[1].text_content().strip():
                    #it is totally unclear which version of the bill is referred to here
                    #so I'm just calling it "bill text"
                    bill_url = text_base_url.format(session=session,
                                                    bill_id=bill_id.replace(
                                                        " ", "+"))
                    if bill_url not in bill_documents.values():
                        bill_documents["Bill Text"] = bill_url

            elif title_text.startswith('fiscal notes'):
                pass
                #skipping fiscal notes for now, they are really ugly
                #but leaving in as a placeholder so we can remember to
                #do this someday, if we feel like it

            elif title_text.startswith('committee reports'):
                pass
                #the committee reports let a legislator
                #comment on a bill. They can comment as
                #"favorable","unfavorable" or "on its merits"
                #but these are NOT votes (per conversation w
                #seceretary of the DE senate 3/16/15). The bill is
                #considered if the majority sign it, which will
                #appear in the bill's action history as being
                #reported out of committee

            elif title_text.startswith('voting'):
                self.find_vote(tds, vote_documents)
            elif title_text.startswith('actions history'):
                action_list = tds[1].text_content().split("\n")

        sub_versions = []
        use_sub = False
        if sub_link:
            bill = self.scrape_bill(sub_link, chamber, session)
            if bill:
                sub_versions = [v["url"] for v in bill["versions"]]
                bill.add_title(bill_id)
                use_sub = True

        if not use_sub:
            bill = Bill(session, chamber, bill_id, bill_title)

            for s in primary_sponsors:
                bill.add_sponsor("primary", s)

            for s in addl_sponsors:
                #it is not totally clear whether "additional sponsors"
                #are co or primary but primary is my best guess
                #based on the bill text, bc they're on the first
                #line with the primary sponsor
                bill.add_sponsor("primary", s)

            for s in cosponsors:
                bill.add_sponsor("cosponsor", s)

        for name, doc_link in bill_documents.items():
            if "Engrossment" in name or "Bill Text" in name:
                if doc_link not in sub_versions:
                    bill.add_version(name, doc_link, mimetype="text/html")
            else:
                pass
                bill.add_document(name, doc_link, mimetype="text/html")

        for a in action_list:
            if a.strip():
                date, action = a.split('-', 1)
                try:
                    date = datetime.strptime(date.strip(), '%b %d, %Y')
                except ValueError:
                    date = datetime.strptime(date.strip(),
                                             '%B %d, %Y')  # XXX: ugh.
                action = action.strip()
                actor = actions.get_actor(action, bill['chamber'])
                attrs = dict(actor=actor, action=action, date=date)
                attrs.update(**self.categorizer.categorize(action))
                attrs["action"] = " ".join(attrs["action"].split())
                bill.add_action(**attrs)

        for name, doc in vote_documents.items():
            vote_chamber = "lower" if "house" in name.lower() else "upper"
            try:
                self.head(doc)
            except requests.exceptions.HTTPError:
                self.logger.warning("could not access vote document")
                continue
            vote_page = self.lxmlize(doc)
            vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1]
            yes_votes = []
            no_votes = []
            other_votes = []
            lines = vote_info.text_content().split("\n")
            for line in lines:
                if line.strip().startswith("Date"):
                    date_str = " ".join(line.split()[1:4])
                    date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                    passage_status = line.strip().split()[-1]

                    #we've never seen a vote with anything but "passed"
                    #so throw an error otherwise so we can figure it out
                    passed_statuses = ["Passed"]
                    failed_statuses = ["Defeated", "Rescinded"]
                    if passage_status not in passed_statuses + failed_statuses:
                        raise AssertionError(
                            "Unknown passage state {}".format(passage_status))
                    passed = passage_status in passed_statuses

                if line.strip().startswith("Vote Type"):
                    if "voice" in line.lower():
                        voice_vote = True
                    else:
                        voice_vote = False
                        yes_count = int(re.findall("Yes: (\d+)", line)[0])
                        no_count = int(re.findall("No: (\d+)", line)[0])
                        other_count = int(
                            re.findall("Not Voting: (\d+)", line)[0])
                        other_count += int(
                            re.findall("Absent: (\d+)", line)[0])
                        vote_tds = vote_page.xpath(".//table//td")
                        person_seen = False
                        for td in vote_tds:
                            if person_seen:
                                person_vote = td.text_content().strip()
                                if person_vote == "Y":
                                    yes_votes.append(person)
                                elif person_vote == "N":
                                    no_votes.append(person)
                                elif person_vote in ["NV", "A", "X", "C"]:
                                    other_votes.append(person)
                                else:
                                    raise AssertionError(
                                        "Unknown vote '{}'".format(
                                            person_vote))
                                person_seen = False
                            else:
                                person = td.text_content().strip()
                                if person:
                                    person_seen = True

            if voice_vote:
                vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0)
            else:
                vote = Vote(vote_chamber,
                            date,
                            "passage",
                            passed,
                            yes_count,
                            no_count,
                            other_count,
                            yes_votes=[],
                            no_votes=[],
                            other_votes=[])

                vote["yes_votes"] = yes_votes
                vote["no_votes"] = no_votes
                vote["other_votes"] = other_votes

            if (passed and vote["yes_count"] <= vote["no_count"]
                    and not voice_vote):
                raise AssertionError("Vote passed with more N than Y votes?")

            if not passed and vote["yes_count"] > vote["no_count"]:
                self.logger.warning("Vote did not pass but had a majority \
                        probably worth checking")

            if "Amendment" in name:
                vote["type"] = "amendment"
            else:
                vote["type"] = "passage"
            vote.add_source(doc)
            bill.add_vote(vote)

        bill.add_source(link)

        return bill
Exemple #38
0
    def scrape_bill(self, url, kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_get_actor=actions.get_actor):
        """Scrape a single bill page, populate a Bill and save it.

        Builds ``Bill(**kw)``, records ``url`` as a source, then reads the
        page for sponsors, versions, actions, votes, amendments,
        engrossments, fiscal notes and a handful of extra text fields before
        passing the bill to ``self.save_bill``.

        Args:
            url: Address of the bill page; loaded through
                ``self._url_2_lxml``.
            kw: Dict of keyword arguments forwarded unchanged to ``Bill``.
            re_amendment: Compiled pattern; group 1 is used as the short
                amendment id taken from an amendment link's text.
            re_substitution: Compiled pattern; group 1 is used as the name
                of the version added for a substituted bill.
            re_digits: Compiled pattern matched against the start of the
                "General Assembly" heading text to get the session number.
            actions_get_actor: Callable invoked as
                ``actions_get_actor(action_text, chamber)``. Taking it as a
                default keeps it reachable even though the local name
                ``actions`` is rebound to the page's action text below.

        Raises:
            IndexError: If the "General Assembly" heading or the actions
                table cell is not found (both are indexed with ``[0]``).
            KeyError: If a sponsor heading is not a key of ``sponsor_types``.
            AttributeError: If substitution or amendment text does not match
                its pattern (``.group`` is called on the match result).
            ValueError: If an action line has no ``' - '`` separator or its
                date matches neither accepted format.
        """

        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # A few helpers.
        _url_2_lxml = self._url_2_lxml
        _cleanup_sponsors = self._cleanup_sponsors

        # Shortcut function partial to get text at a particular xpath:
        # ``get_text`` is called with ``doc`` and ``0`` pre-bound, so callers
        # below only supply the xpath.
        doc = _url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
        session_num = doc.xpath(xpath)[0].text_content()
        # Keep only what re_digits matches at the start of the heading text.
        session_num = re_digits.match(session_num).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        # Maps the heading text shown on the page to the sponsor type stored
        # on the bill.
        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'}

        xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(xpath + '/text()')
        sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()')

        # Headings and sponsor cells are paired positionally.
        for h, s in zip(headings, sponsors):

            names = _cleanup_sponsors(s, chamber)
            type_ = sponsor_types[h.strip()]

            if names:
                # Each entry is unpacked as a (name, chamber) pair.
                for name, _chamber in names:
                    bill.add_sponsor(type_, name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions

        # URL template handed to scrape_documents; the placeholders are not
        # filled in here.
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open'])

        documents = self.scrape_documents(source=url,
                                     docname="introduced",
                                     filename="Legis",
                                     tmp=tmp,
                                     session_num=session_num)

        # Each item is expanded as keyword arguments to add_version.
        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        names = doc.xpath('//*[contains(text(), "Substituted '
                          'Legislation for Bill:")]/text()')
        urls = doc.xpath('//*[contains(text(), "Substituted '
                          'Legislation for Bill:")]'
                         '/following-sibling::a/@href')

        # NOTE: this loop rebinds ``url``; the parameter was already recorded
        # as the bill source at the top of the function.
        for name, url in zip(names, urls):

            name = re_substitution.match(name).group(1)
            bill.add_version(name, url,
                             description='original bill')

        #---------------------------------------------------------------------
        # Actions
        # From here on ``actions`` is the page's action text, not the module
        # whose get_actor was captured in actions_get_actor.
        actions = doc.xpath('//font[contains(., "Actions History")]'
                            '/../following-sibling::table/descendant::td[2]')
        actions = actions[0].text_content()
        # Drop empty lines.
        # NOTE(review): reversed() below needs a sequence, so this relies on
        # filter() returning a list (Python 2 behaviour).
        actions = filter(None, actions.splitlines())

        # Lines are processed in reverse of their order on the page.
        for a in reversed(actions):
            date, action = a.split(' - ', 1)
            # Try the abbreviated month name first, then the full month name.
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

            actor = actions_get_actor(action, bill['chamber'])
            attrs = dict(actor=actor, action=action, date=date)
            # Merge in whatever the categorizer returns for this action text.
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        #---------------------------------------------------------------------
        # Votes
        vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

        # Sometimes vote strings are contained in weird, separate elements. Probably
        # hand edited.
        # If any string lacks a digit, discard them all and rebuild the list
        # from the parent elements instead.
        if not all(re.search('\d', string) for string in vote_strings):
            # Use the parent's text_content instead.
            vote_strings = []
            for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
                vote_strings.append(el.text_content())

        vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                              '/following-sibling::a/@href')
        # Vote text and vote links are paired positionally.
        for string, url in zip(vote_strings, vote_urls):

            vote_data = parse_votestring(string)
            vote = self.scrape_vote(url, **vote_data)
            # A falsy result from scrape_vote is skipped.
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        xpath = ("//font[contains(., 'Amendments')]/"
                 "../../../td[2]/font/a")

        tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
               'vwLegislation/{id_}/$file/{filename}{format_}?open')

        # Link targets and link texts come from the same anchors, so they are
        # paired positionally.
        for source, id_ in zip(doc.xpath(xpath + '/@href'),
                               doc.xpath(xpath + '/text()')):

            short_id = re_amendment.match(id_).group(1)

            documents = self.scrape_documents(
                source=source,
                docname='amendment (%s)' % short_id,
                filename='Legis',
                tmp=tmp, session_num=session_num,
                id_=id_)

            # Amendments are stored as documents, not versions.
            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open'])

            # Only the first engrossment link is followed.
            documents = self.scrape_documents(
                source=source[0],
                docname="Engrossment",
                filename="Engross",
                tmp=tmp,
                session_num=session_num,
                id_=bill['bill_id'])

            # Engrossments are stored as versions.
            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open'])

            # Only the first fiscal-note link is followed.
            documents = self.scrape_documents(
                source=source[0],
                docname="Fiscal Note",
                filename="Fiscal",
                tmp=tmp,
                session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # Helper to get the first td sibling of certain nodes.
        tmp = '//font[contains(., "%s")]/../../../td[2]'
        first_sibling_text = lambda heading: _get_text(tmp % heading)

        # Bill key -> heading text searched for on the page.
        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, name in extra_fields.iteritems():
            try:
                bill[key] = first_sibling_text(name)
            except IndexError:
                # xpath lookup failed.
                # The field is simply left unset on the bill.
                pass

        self.save_bill(bill)
Exemple #39
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page and save the resulting Bill.

        The page at ``self.base_url + '?r=' + bill_id`` is parsed for the
        title, authors, co-authors and the action table (the last table on
        the page). Linked actions are stored as versions or documents, every
        action is classified with ``_classifiers``, and lower-chamber passage
        votes are scraped from the linked document.

        Args:
            chamber: Chamber the bill belongs to; also used as the actor of
                every action.
            session: Session identifier passed to ``Bill``.
            bill_id: Bill identifier, used in the URL and passed to ``Bill``.
            bill_type: Passed to ``Bill`` as ``type``.

        Raises:
            NoSuchBill: If the page has no title cell.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
            )
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(
                u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                bill.add_sponsor('primary', aname.strip())

            co_authors = doc.xpath(
                u'//td/b[contains(text(),"Co-autor")]/../text()')
            # The names are read from the second text node; require it to
            # exist instead of raising IndexError when only one is present.
            if len(co_authors) > 1:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor', co_author.strip())

            action_table = doc.xpath('//table')[-1]
            # The first row is skipped (presumably a header row).
            for row in action_table[1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")

                action = tds[1].text_content().strip()

                # A linked action is either a new version of the bill or
                # some other document.
                action_url = tds[1].xpath('a/@href')
                if action_url:
                    action_url = action_url[0]
                    # NOTE: new versions appear to be marked with
                    # 'Entirillado'; anything else is filed as a document.
                    if re.match('Entirillado', action):
                        bill.add_version(action, action_url)
                    else:
                        bill.add_document(action, action_url)

                # First matching classifier wins; default to 'other'.
                for pattern, atype in _classifiers:
                    if re.match(pattern, action):
                        break
                else:
                    atype = 'other'

                bill.add_action(chamber, action, date, type=atype)

                # A linked passage action is assumed to be a vote document.
                if atype == 'bill:passed' and action_url:
                    # Use a separate loop variable so that an unmatched
                    # action leaves vote_chamber as None rather than the
                    # chamber of the last pattern tried.
                    vote_chamber = None
                    for pattern, candidate in _voteChambers:
                        if re.match(pattern, action):
                            vote_chamber = candidate
                            break
                    else:
                        self.warning('coudnt find voteChamber pattern')

                    # Only lower-chamber vote documents are scraped here.
                    if vote_chamber == 'lower':
                        vote = self.scrape_votes(action_url, action, date,
                                                 vote_chamber)
                        # scrape_votes returns a pair; the first item is the
                        # vote (or None), the second is reported on failure.
                        if vote[0] is not None:
                            vote[0].add_source(action_url)
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #40
0
    def scrape_for_bill_type(self, chamber, session, url):
        """Scrape every bill listed on one bill-type index page.

        Each bill sits in its own ``cellspacing="4"`` table. The bill id,
        object id and version file names are recovered from the JavaScript
        that writes the bill's button, and each bill is then saved with its
        sponsors, actions, fiscal-note link and PDF versions.

        Rows that cannot be interpreted (unexpected cell count, unknown bill
        type, missing title) are logged and skipped so that the remaining
        bills on the page are still scraped.

        Args:
            chamber: Chamber passed to ``Bill``.
            session: Session identifier passed to ``Bill``.
            url: Address of the listing page.
        """

        self.refresh_session()

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house
            #   current status, committee, committee2, last action
            # Only the button and topic cells are used below.
            tds = details.xpath('td')
            if len(tds) == 9:
                _, button, _, subject, _, _, _, _, _ = tds
            elif len(tds) == 8:
                _, button, _, subject, _, _, _, _ = tds
            else:
                self.warning('invalid row: (tds=%s) %s', len(tds),
                             details.text_content())
                continue

            # contains script tag that has a document.write that writes the
            # bill_id, we have to pull that out (gross, but only way)
            script_text = button.text_content()
            # skip SBIR/HBIR
            if 'SBIR' in script_text or 'HBIR' in script_text:
                continue

            # script text looks like:
            #   document.write("<input type=button id=BTN71139 name=BTN71139
            #       style='font-weight:normal' value='SB1'");
            #   document.write(" onClick=\"javascript:instrumentSelected(
            #       this,'71139','SB1','ON','ON','ON','");
            #   document.write(status + "','OFF','SB1-int.pdf,,',
            #       'SB1-int.pdf,,')\">");
            oid, bill_id, fnotes = re.findall(
                r"instrumentSelected\(this,'(\d+)','(\w+)','ON','ON','(ON|OFF)'",
                script_text)[0]
            second_piece = re.findall(
                r"status \+ \"','(ON|OFF)','([^,]*),([^,]*),([^,]*)\'",
                script_text)
            if second_piece:
                # The first captured flag is not needed.
                _, intver, engver, enrver = second_piece[0]
            else:
                intver = engver = enrver = None

            subject = subject.text_content()

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                # Without this branch bill_type would be unbound, or left
                # over from the previous bill.
                self.warning('unknown bill type for %s', bill_id)
                continue

            # title is missing on a few bills; skip just those rather than
            # abandoning the rest of the page
            title = desc.text_content().strip()
            if not title:
                self.warning('skipping %s: missing title', bill_id)
                continue

            # create bill
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            if subject:
                bill['subjects'] = [subject]

            if fnotes == 'ON':
                bill.add_document(
                    'fiscal notes',
                    'http://alisondb.legislature.state.al.us/acas/ACTIONFiscalNotesFrameMac.asp?OID=%s&LABEL=%s'
                    % (oid, bill_id))

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URLs; a version is added only when its file name
            # was present in the script
            for version_name, filename in (('introduced', intver),
                                           ('engrossed', engver),
                                           ('enrolled', enrver)):
                if filename:
                    bill.add_version(version_name,
                                     self.base_doc_url + filename,
                                     mimetype='application/pdf')

            self.save_bill(bill)
Exemple #41
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape and save every Senate document listed for a session.

        For each Senate document type the history listing is fetched, and
        every linked bill page is parsed for its id, title, text link,
        sponsors and minutes before actions and votes are scraped and the
        bill is saved.

        Args:
            chamber: Chamber passed to ``Bill``.
            insert: Session path segment used to build the listing and bill
                URLs; also passed to ``self.scrape_votes``.
            session: Session identifier passed to ``Bill``.
            year: Passed through to ``self.scrape_votes``.
        """
        # DoctypeID used by the listing report -> bill type.
        doc_type = {
            2: 'bill',
            4: 'resolution',
            7: 'concurrent resolution',
            8: 'joint resolution'
        }

        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
                insert, docnum)
            links = self.scrape_links(parentpage_url)
            for link in links:
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link)

                with self.urlopen(page_path) as page:
                    # Non-breaking spaces are replaced with plain spaces
                    # before parsing.
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath(
                        'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)'
                    )
                    title = root.xpath(
                        'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)'
                    )

                    bill = Bill(session,
                                chamber,
                                bill_id,
                                title,
                                type=bill_type)

                    bill_text = root.xpath(
                        "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)"
                    )
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url)

                    primary, secondary = self.scrape_sponsors(page)

                    if primary and primary[0] == 'By:':
                        # A leading 'By:' means the remaining parts form one
                        # sponsor name.
                        primary.pop(0)

                        # Guard against 'By:' having been the only entry.
                        if primary and primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        # Each part is followed by a space, so the stored
                        # name keeps the trailing space it has always had.
                        full_name = ''.join(part + " " for part in primary)
                        if full_name:
                            bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    # The date for each minutes link is read from table 4,
                    # starting at row 2 and advancing one row per link.
                    minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                    for row_num, mr in enumerate(minutes_links, 2):
                        minutes = mr.xpath("string(@href)")
                        minutes_url = "http://www.leg.state.nv.us" + minutes
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % row_num
                        minutes_date = mr.xpath(minutes_date_path).split()
                        # The first three tokens of the date cell are joined
                        # without separators to form the document name.
                        minutes_date = minutes_date[0] + minutes_date[
                            1] + minutes_date[2] + " Minutes"
                        bill.add_document(minutes_date, minutes_url)

                    self.scrape_actions(root, bill, "upper")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemple #42
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Fetch one bill page (2009 session and later) and save the bill with
        its documents, sponsors, actions and votes.

        ``short_title`` is added as an alternate title when it differs,
        ignoring case, from the title found on the page.
        """
        compact_id = bill_id.replace(' ', '')
        url = BILL_URL % (session, compact_id)
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            html.make_links_absolute(
                'http://legislature.idaho.gov/legislation/%s/' % session)

            # All of the data lives in tables nested in the second cell of
            # the page's outer table: [0] sponsors, [1] title, [2] actions.
            content_cell = html.xpath('./body/table/tr/td[2]')[0]
            bill_tables = content_cell.xpath('.//table')
            title = bill_tables[1].text_content().strip()

            bill = Bill(session, chamber, bill_id, title,
                        type=get_bill_type(bill_id))
            bill.add_source(url)
            bill['subjects'] = self._subjects[compact_id]

            if short_title:
                if bill['title'].lower() != short_title.lower():
                    bill.add_title(short_title)

            # Linked documents: bill text and engrossments are versions,
            # everything else is a supporting document.
            for anchor in html.xpath('//span/a'):
                label = anchor.text_content().strip()
                target = anchor.get('href')
                if 'Engrossment' in label or 'Bill Text' in label:
                    attach = bill.add_version
                else:
                    attach = bill.add_document
                attach(label, target)

            # Sponsors follow each 'by' in the first table and may be a
            # committee, one legislator or a comma-separated group.
            sponsor_chunks = bill_tables[0].text_content().split('by')
            for chunk in sponsor_chunks[1:]:
                for person in chunk.split(','):
                    bill.add_sponsor('primary', person)

            actor = chamber
            last_date = None
            for row in bill_tables[2]:
                # Plenty of rows are empty spacers with a single cell.
                if len(row) == 1:
                    continue
                cell_texts = [cell.text_content().strip() for cell in row]
                _, day, action, _ = cell_texts

                # A blank date cell inherits the most recent date seen.
                if day:
                    last_date = day
                day = last_date

                year_part = session[0:4]
                date = datetime.datetime.strptime(day + '/' + year_part,
                                                  "%m/%d/%Y")

                if action.startswith('House'):
                    actor = 'lower'
                elif action.startswith('Senate'):
                    actor = 'upper'

                action_cell = row[2]

                # Rows mentioning AYES or NAYS carry a vote.
                if 'AYES' in action or 'NAYS' in action:
                    vote = self.parse_vote(actor, date, action_cell)
                    vote.add_source(url)
                    bill.add_vote(vote)

                # When the cell has child elements (text split by <br>),
                # rebuild the action from all of its text pieces.
                if len(action_cell):
                    action = "".join(action_cell.itertext())
                action = action.replace(u'\xa0', ' ').strip()

                bill.add_action(actor, action, date,
                                type=get_action(actor, action))

                # Some actions send the bill 'to House' or 'to Senate',
                # which changes the actor for the rows that follow.
                if 'to House' in action:
                    actor = 'lower'
                elif 'to Senate' in action:
                    actor = 'upper'
            self.save_bill(bill)
Exemple #43
0
    def scrape_bills(self, chamber_to_scrape, session):
        """Scrape and save every measure of one chamber for a session.

        The session's all-measures XML index is read; for each measure in
        the requested chamber the detail XML is fetched and turned into a
        Bill with sponsors, versions, subjects, actions, an optional veto
        document and votes.

        Args:
            chamber_to_scrape: ``"upper"`` or ``"lower"``; measures of the
                other chamber are skipped.
            session: Session identifier used in URLs. Its first four
                characters are used as the year of every action date.

        Raises:
            KeyError: If the second character of a measure id in the
                requested chamber is not a known bill-type letter.
        """
        site = 'http://billstatus.ls.state.ms.us/'
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            chamber = "upper" if bill_id[0] == "S" else "lower"

            # just skip past bills that are of the wrong chamber (checked
            # before the type lookup so they can never make it fail)
            if chamber != chamber_to_scrape:
                continue

            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
            details_page = self.get(bill_details_url)

            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.
            page = details_page.content.replace(chr(11), "")

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(session, chamber, bill_id, title,
                        type=bill_type, summary=longtitle)

            # sponsors: only the first word of P_NAME is used as the name
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                bill.add_sponsor("primary", main_sponsor,
                                 main_sponsor_url=main_sponsor_url)
            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                    bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

            # Versions: (XML tag, version name, must contain "documents").
            # The first two are added whenever the tag is non-empty; the
            # rest only when the path mentions "documents".
            version_specs = (
                ('CURRENT_OTHER', "Current version", False),
                ('INTRO_OTHER', "As Introduced", False),
                ('CMTESUB_OTHER', "Committee Substitute", True),
                ('PASSED_OTHER', "As Passed the " + chamber, True),
                ('ASG_OTHER', "Approved by the Governor", True),
            )
            for tag, version_name, needs_documents in version_specs:
                version_path = details_root.xpath(
                    'string(//%s)' % tag).replace("../../../../", "")
                if needs_documents:
                    present = "documents" in version_path
                else:
                    present = version_path != ""
                if present:
                    bill.add_version(version_name, site + version_path,
                                     on_duplicate='use_new',
                                     mimetype='text/html')

            # use committee names as scraped subjects; set once per bill so
            # that bills without any history rows get them too
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')
            bill['subjects'] = subjects

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action_el in details_root.xpath('//HISTORY/ACTION'):
                action_num = int(action_el.xpath('string(ACT_NUMBER)').strip())
                act_vote = action_el.xpath('string(ACT_VOTE)').replace("../../../..", "")
                action_desc = action_el.xpath('string(ACT_DESC)')
                # The description starts with a month/day date; the year
                # comes from the session.
                date, action_desc = action_desc.split(" ", 1)
                date = datetime.strptime(date + "/" + session[0:4],
                                         "%m/%d/%Y")

                # A "(H)" or "(S)" prefix names the chamber; anything else
                # is attributed to the executive.
                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action:
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    bill.add_document("Veto", site + version_path)

                # First matching prefix decides the action type.
                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(actor, action, date, type=atype,
                                action_num=action_num)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        vote = self.scrape_votes(vote_url, action,
                                                 date, actor)
                        vote.add_source(vote_url)
                        bill.add_vote(vote)

            bill.add_source(bill_details_url)
            self.save_bill(bill)
Exemple #44
0
    def scrape_bill(self, session, history_url):
        """Scrape a single bill from its XML history document and save it.

        Fetches ``history_url`` and builds a ``Bill`` from the caption,
        subjects, actions and sponsor lists found in the XML.  Entries of
        ``self.versions``, ``self.analyses``, ``self.fiscal_notes`` and
        ``self.witnesses`` whose first element equals the bill id are attached
        as versions/documents.  The finished bill is passed to
        ``self.save_bill``.

        :param session: session identifier, passed through to ``Bill``.
        :param history_url: URL of the bill's XML history document.
        :returns: ``None``.  Returns early (after a warning) when the document
            has no caption or contains "Bill does not exist".
        :raises ScrapeError: when the bill id has no recognised type code
            (B, R, CR or JR) after its first character.
        """
        history_xml = self.get(history_url).content
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if (bill_title is None or "Bill does not exist" in history_xml):
            self.warning("Bill does not appear to exist")
            return
        # The first space-separated token of the 'bill' attribute is dropped;
        # what remains is used as the bill id.
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        bill.add_source(history_url)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        # In every collection below an entry is (bill_id, url, ...); the fifth
        # character from the end of the URL is the key into NAME_SLUGS.
        for version in self.versions:
            if version[0] != bill_id:
                continue
            bill.add_version(name=self.NAME_SLUGS[version[1][-5]],
                             url=version[1],
                             mimetype='text/html')

        document_sources = (
            ("Analysis", self.analyses),
            ("Fiscal Note", self.fiscal_notes),
            ("Witness List", self.witnesses),
        )
        for label, records in document_sources:
            for record in records:
                if record[0] != bill_id:
                    continue
                bill.add_document(
                    name="{} ({})".format(label,
                                          self.NAME_SLUGS[record[1][-5]]),
                    url=record[1],
                    mimetype='text/html')

        # Descriptions that map to exactly one action type.  None of these
        # can also satisfy one of the prefix/regex tests further down, so
        # checking them first does not change which branch is taken.
        exact_types = {
            'Amended': 'amendment:passed',
            'Amendment(s) offered': 'amendment:introduced',
            'Amendment amended': 'amendment:amended',
            'Amendment withdrawn': 'amendment:withdrawn',
            'Passed': 'bill:passed',
            'Adopted': 'bill:passed',
            'Passed as amended': 'bill:passed',
            'Vetoed by the Governor': 'governor:vetoed',
            'Reported favorably w/o amendment(s)': 'committee:passed',
            'Filed': 'bill:filed',
            'Read 3rd time': 'bill:reading:3',
            'Read 2nd time': 'bill:reading:2',
        }
        actors = {'H': 'lower', 'S': 'upper', 'E': 'executive'}

        # Must live outside the loop: it records whether an earlier action
        # already marked the bill as introduced, so that 'Read & adopted'
        # only adds 'bill:introduced' when nothing before it did.
        introduced = False

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            # First character of the action number identifies the actor.
            actor = actors[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            if desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['bill:passed']
                if not introduced:
                    introduced = True
                    atype.append('bill:introduced')
            elif desc in exact_types:
                atype = exact_types[desc]
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'bill:filed'
            elif desc.startswith('Sent to the Governor'):
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif (desc.startswith('Referred to')
                  or desc.startswith("Recommended to be sent to ")):
                atype = 'committee:referred'
            elif desc.startswith('Reported favorably'):
                atype = 'committee:passed:favorable'
            else:
                atype = 'other'

            if atype == 'committee:referred':
                # What is left after removing the lead-in phrase is stored
                # as the committee name.
                ctty = desc
                for lead_in in ('Referred to', "Recommended to be sent to "):
                    ctty = ctty.replace(lead_in, "").strip()
                extra['committees'] = ctty

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        # (element name, sponsor type, official_type); names inside each
        # element are separated by ' | '.
        sponsor_fields = (
            ('authors', 'primary', 'author'),
            ('coauthors', 'cosponsor', 'coauthor'),
            ('sponsors', 'primary', 'sponsor'),
            ('cosponsors', 'cosponsor', 'cosponsor'),
        )
        for tag, sponsor_type, official_type in sponsor_fields:
            # findtext() gives None when the element is absent.
            names = root.findtext(tag) or ''
            for name in names.split(' | '):
                if name != "":
                    bill.add_sponsor(sponsor_type, name,
                                     official_type=official_type)

        self.save_bill(bill)
Exemple #45
0
    def scrape_bills(self, chamber_to_scrape, session):
        """Scrape and save every measure of one chamber for a session.

        Reads the session's ``allmsrs.xml`` index.  For each measure whose
        chamber (ids starting with "S" are "upper", everything else "lower")
        equals ``chamber_to_scrape``, the measure's detail page is fetched and
        its sponsors, versions, subjects, actions and votes are recorded on a
        ``Bill`` that is handed to ``self.save_bill``.

        :param chamber_to_scrape: chamber to keep ("upper" or "lower").
        :param session: session identifier; it is interpolated into the URLs
            and its first four characters are used as the year of every
            action date.
        :raises KeyError: when a measure of the requested chamber has a
            second id character other than B, C, R or N.
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_types = {'B': 'bill', 'C': 'concurrent resolution',
                      'R': 'resolution', 'N': 'nomination'}

        with self.urlopen(url) as bill_dir_page:
            root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
            for mr in root.xpath('//lastaction/msrgroup'):
                bill_id = mr.xpath('string(measure)').replace(" ", "")
                chamber = "upper" if bill_id[0] == "S" else "lower"

                # Skip bills of the wrong chamber before looking at anything
                # else, so an unexpected type letter on a bill we would not
                # scrape anyway cannot abort the run.
                if chamber != chamber_to_scrape:
                    continue

                bill_type = bill_types[bill_id[1]]

                link = mr.xpath('string(actionlink)').replace("..", "")
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as details_page:
                    details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type, longtitle=longtitle)

                    # Sponsors: only the first whitespace-separated token of
                    # p_name is kept as the primary sponsor's name.
                    main_sponsor = details_root.xpath('string(//p_name)').split()
                    if main_sponsor:
                        main_sponsor = main_sponsor[0]
                        main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                        bill.add_sponsor("primary", main_sponsor,
                                         main_sponsor_url=main_sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        if leg:
                            leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                            bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

                    # Versions: (element name, version name, always add?).
                    # The first two are added unconditionally; the others
                    # only when their path mentions "documents".
                    version_specs = (
                        ('current_other', "Current version", True),
                        ('intro_other', "As Introduced", True),
                        ('cmtesub_other', "Committee Substitute", False),
                        ('passed_other', "As Passed the " + chamber, False),
                        ('asg_other', "Approved by the Governor", False),
                    )
                    for tag, version_name, always in version_specs:
                        version_path = details_root.xpath('string(//%s)' % tag)
                        version_path = version_path.replace("../../../../", "")
                        if always or "documents" in version_path:
                            bill.add_version(
                                version_name,
                                "http://billstatus.ls.state.ms.us/" + version_path)

                    # Use committee names as scraped subjects.  This does not
                    # depend on any single action, so it is done once here
                    # (and therefore also for bills with no actions).
                    subjects = details_root.xpath('//h_name/text()')
                    subjects += details_root.xpath('//s_name/text()')
                    bill['subjects'] = subjects

                    # avoid duplicate votes
                    seen_votes = set()

                    # Actions
                    for action_el in details_root.xpath('//history/action'):
                        action_num = int(
                            action_el.xpath('string(act_number)').strip())
                        act_vote = action_el.xpath('string(act_vote)').replace("../../../..", "")
                        action_desc = action_el.xpath('string(act_desc)')
                        # act_desc starts with a month/day date; the year is
                        # taken from the session.
                        date, action_desc = action_desc.split(" ", 1)
                        date = datetime.strptime(date + "/" + session[0:4],
                                                 "%m/%d/%Y")

                        if action_desc.startswith("(H)"):
                            actor = "lower"
                            action = action_desc[4:]
                        elif action_desc.startswith("(S)"):
                            actor = "upper"
                            action = action_desc[4:]
                        else:
                            actor = "executive"
                            action = action_desc

                        # NOTE(review): every action mentioning "Veto" adds
                        # the same veto document again -- confirm whether
                        # duplicates are acceptable downstream.
                        if "Veto" in action:
                            veto_path = details_root.xpath("string(//veto_other)")
                            veto_path = veto_path.replace("../../../../", "")
                            bill.add_document(
                                "Veto",
                                "http://billstatus.ls.state.ms.us/" + veto_path)

                        atype = 'other'
                        for prefix, prefix_type in self._action_types:
                            if action.startswith(prefix):
                                atype = prefix_type
                                break

                        bill.add_action(actor, action, date, type=atype,
                                        action_num=action_num)

                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            if vote_url not in seen_votes:
                                seen_votes.add(vote_url)
                                vote = self.scrape_votes(vote_url, action,
                                                         date, actor)
                                vote.add_source(vote_url)
                                bill.add_vote(vote)

                    bill.add_source(bill_details_url)
                    self.save_bill(bill)
Exemple #46
0
    def parse_bill(self, chamber, session, bill_id, url):
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            try:
                version_link = page.xpath(
                    "//a[contains(@href, '%s/bill.doc')]" % bill_id)[0]
            except IndexError:
                # Bill withdrawn
                return

            title = version_link.xpath("string(following-sibling::p[1])")
            title = re.sub(ur'[\s\xa0]+', ' ', title).strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            bill.add_version("Most Recent Version",
                             version_link.attrib['href'])

            for link in page.xpath("//a[contains(@href, 'legislator/')]"):
                bill.add_sponsor('primary', link.text.strip())

            action_p = version_link.xpath("following-sibling::p[2]")[0]
            for line in action_p.xpath("string()").split("\n"):
                action = line.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                action_date = "%s %s" % (action.split('-')[0],
                                         session[0:4])
                action_date = datetime.datetime.strptime(
                    action_date, '%b %d %Y')

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                atype = []
                if action.startswith('introduced in'):
                    atype.append('bill:introduced')
                elif action.startswith('signed by Governor'):
                    atype.append('governor:signed')
                elif re.match(r'^to [A-Z]', action):
                    atype.append('committee:referred')

                if '1st reading' in action:
                    atype.append('bill:reading:1')
                if '3rd reading' in action:
                    atype.append('bill:reading:3')
                if '2nd reading' in action:
                    atype.append('bill:reading:2')

                amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                                r'( and \([a-z\d\-]+\))? filed')
                if re.search(amendment_re, action):
                    atype.append('amendment:introduced')

                if not atype:
                    atype = ['other']

                bill.add_action(actor, action, action_date, type=atype)

            try:
                votes_link = page.xpath(
                    "//a[contains(@href, 'vote_history.pdf')]")[0]
                bill.add_document("Vote History",
                                  votes_link.attrib['href'])
            except IndexError:
                # No votes
                pass

            self.save_bill(bill)
Exemple #47
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one bill's status page and save the resulting Bill.

        The page is looked up under the first four characters of ``session``
        and, if that page reports the bill as missing, under the last four.
        Title, sponsors (first listed is primary, the rest cosponsors),
        subjects, actions, roll-call votes, versions and documents are
        recorded before calling ``self.save_bill``.

        :param chamber: chamber passed through to ``Bill``.
        :param session: session identifier; its first and last four
            characters are tried as the year in the bill URL.
        :param bill_id: bill identifier; spaces become dashes in the URL.
        :returns: ``True`` when the bill was saved, ``None`` when neither URL
            yields a bill page.
        """
        bill_slug = bill_id.replace(' ', '-')
        missing_markers = ('Page Not Found',
                           'The bill you are looking for is not available yet')

        # Try the first year, then the second.  ``url`` always ends up as the
        # address that was actually fetched last, so the recorded source
        # matches the page the data came from.
        html = None
        for year in (session[:4], session[-4:]):
            url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (year,
                                                                 bill_slug)
            candidate = self.get(url).text
            if not any(marker in candidate for marker in missing_markers):
                html = candidate
                break
        if html is None:
            return None

        doc = lxml.html.fromstring(html)

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(session=session, chamber=chamber, bill_id=bill_id,
                    title=title, type=bill_type)
        bill.add_source(url)

        # sponsors
        sp_type = 'primary'
        for sponsor in doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ')
            bill.add_sponsor(sp_type, sponsor)
            sp_type = 'cosponsor'

        bill['subjects'] = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            journal = tds[1].text_content()
            action = tds[2].text_content()
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            bill.add_action(actor, action, date,
                            type=categorize_action(action))

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if not rcmatch:
                continue
            rc_num = rcmatch.group(1)

            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if not journal_link:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
                continue

            objectname = journal_link[0].rsplit('=', 1)[-1]
            chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
            vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                session, chamber_name, objectname)
            # Counts and outcome are placeholders until the roll call has
            # been parsed.
            vote = Vote(actor, date, action, False, 0, 0, 0)
            self.parse_roll_call(vote, vote_url, rc_num)

            # check the expected counts vs actual
            for label, votes_key in (('YEAS', 'yes_votes'),
                                     ('NAYS', 'no_votes')):
                stated = re.search(r'%s (\d+)' % label, action,
                                   re.IGNORECASE)
                stated = int(stated.group(1)) if stated else 0
                if stated != len(vote[votes_key]):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, stated,
                                  len(vote[votes_key])))

            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(vote_url)
            bill.add_vote(vote)

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            version = self.parse_doc_row(row)
            if not version:
                continue
            if version[1].endswith('.pdf'):
                bill.add_version(*version, mimetype='application/pdf')
            elif version[1].endswith('.htm'):
                bill.add_version(*version, mimetype='text/html')
            else:
                # Neither known extension: never reuse the mimetype of a
                # previous row; let add_version apply its own default.
                self.warning("unknown version mimetype for %s %s" %
                             (bill_id, version[1]))
                bill.add_version(*version)

        # documents
        for table_id in ('frg_billstatus_HlaTable', 'frg_billstatus_SfaTable'):
            for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
                document = self.parse_doc_row(row)
                if document:
                    bill.add_document(*document)

        self.save_bill(bill)
        return True
Exemple #48
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        The bill page is expected to contain at least three tables with class
        "bill-table": the first holds the sponsor text, the second the title
        and the third the action history.

        :param chamber: chamber of origin; also the initial actor for actions.
        :param session: session identifier; used in the URL and its first
            four characters are used as the year of every action date.
        :param bill_id: bill identifier; spaces are removed for the URL and
            the subject lookup.
        :param short_title: optional alternative title, added when it differs
            (case-insensitively) from the scraped title.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

        if short_title and bill['title'].lower() != short_title.lower():
            bill.add_title(short_title)

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version(name, href, mimetype='application/pdf')
            else:
                bill.add_document(name, href)

        # Separators between sponsor names: a comma (optionally followed by
        # "AND") or the standalone word "AND".  Only the separator is matched
        # so that the names on both sides of it are kept.
        # NOTE(review): "AND" is matched in upper case only -- confirm the
        # page never uses a lower-case "and" between names.
        separator_re = re.compile(r"(?:\s*,\s*|\s+)AND\s+|\s*,\s*")

        def _split(string):
            return separator_re.split(string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsor('primary', sponsors.strip())
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsor('primary', person)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # Rows without a date inherit the date of the last dated row.
            if date:
                last_date = date
            else:
                date = last_date

            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y")
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                vote = self.parse_vote(actor, date, row[2])
                vote.add_source(url)
                bill.add_vote(vote)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(actor, action, date, type=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        self.save_bill(bill)
Exemple #49
0
    def scrape_bills(self, session, year_abr):
        #Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['IdenticalBillNumber'].strip():
                bill.add_companion(rec['IdenticalBillNumber'].split()[0])

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsor(sponsor_type, name)


        #Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            if rec['DocType'] in self._version_types:
                # Clean HTMX links.
                if htm_url.endswith('HTMX'):
                    htm_url = re.sub('X$', '', htm_url)

                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zipedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = zipedfile.open(vfile, 'U')
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                votes = {}
                if filename.startswith('A') or filename.startswith('CA'):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith('C'):
                    vote_file_type = 'committee'
                else:
                    vote_file_type = 'chamber'

                for rec in vdict_file:

                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None, None,
                                              None, None, bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)

            # remove temp file
            os.remove(s_vote_zip)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count

                # Veto override.
                if vote['motion'] == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    vote['passed'] = False
                    if vote['chamber'] == 'lower':
                        if vote_yes_count >= 54:
                            vote['passed'] = True
                    elif vote['chamber'] == 'upper':
                        if vote_yes_count >= 27:
                            vote['passed'] = True

                # Regular vote.
                elif vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            if not bill['actions'] and not bill['versions']:
                self.warning('probable phony bill detected %s',
                             bill['bill_id'])
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                self.save_bill(bill)

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Exemple #50
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on each bill from a number of DBF files.

        Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ tables
        through ``self.get_dbf`` and the zipped vote dumps fetched with
        ``self.urlretrieve``, merges everything into ``Bill`` objects keyed
        by bill id, and saves every bill at the end.

        MAINBILL records with a blank title are not loaded, so the other
        tables may mention bill ids that are not in the dictionary.  Such
        records are skipped with a warning instead of aborting the scrape.

        :param session: session identifier, passed to ``Bill`` as a string.
        :param year_abr: year value used to locate the DBF files and to
            build document URLs; it and the following year are used to
            name the vote files.
        :raises Exception: when a BILLWP record has a doctype that is not
            in ``self._doctypes``.
        """

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            if rec["type"] == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # keep only the last directory and the file name of the
            # backslash-separated path stored in the table
            document = rec["document"].split('\\')
            document = document[-2] + "/" + document[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            s_vote_zip, resp = self.urlretrieve(s_vote_url)

            # File names starting with A/CA are handled as the lower
            # chamber, everything else as the upper chamber; a leading C
            # marks a committee vote file.
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            votes = {}
            zipedfile = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                vote_file = zipedfile.open("%s.txt" % filename, 'U')

                for rec in csv.DictReader(vote_file):

                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    # one Vote per (bill, chamber, motion); each record
                    # contributes a single legislator's vote to it
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None, None,
                                              None, None, bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)
            finally:
                # release the archive and remove the temp file even when
                # parsing fails part-way through
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Exemple #51
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        Collects the title, sponsors, actions, subjects, versions and
        documents.  Actions that look like a roll call are handed to
        ``self.parse_vote`` when the journal cell carries a link.  A page
        without a ``TITLE:`` label is skipped with a warning.

        :param chamber: chamber passed to ``Bill``.
        :param session: session passed to ``Bill`` and used in the
            full-text and document list URLs.
        :param bill_id: bill identifier, also used in those URLs.
        :param bill_type: stored as the bill's ``type``.
        :param url: address of the bill page.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                # use the stripped name, as is done for the cosponsors
                bill.add_sponsor('primary', sponsor)

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsor('cosponsor', sponsor)
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            # The suffix sits at the END of the string, so it has to be
            # looked up with re.search; re.match only tests the start and
            # could never find it in the stripped string.
            by_request = r' BY REQUEST OF THE GOVERNOR$'
            if re.search(by_request, spons_str):
                spons_str = re.sub(by_request, '', spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsor('primary', spons_str)

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            # NOTE(review): when the chamber cell is neither "(H)" nor
            # "(S)", act_chamber keeps the previous row's value (or is
            # unbound on the first row) -- confirm whether that is intended.
            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            # an action shaped like "<word> Y<digits>" is treated as a vote
            if re.match(r"\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    self.parse_vote(bill, action, act_chamber, act_date,
                                    vote_href[0])

            action, atype = self.clean_action(action)

            # the prefile action carries its own date in the text
            match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(act_chamber, action, act_date, type=atype)

        # Get subjects
        bill['subjects'] = [
            subj.strip()
            for subj in doc.xpath('//a[contains(@href, "subject")]/text()')]

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version(name, text_url, mimetype="text/html")

        # Get documents
        doc_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_documents.asp?session=%s&bill=%s" % (
                    session, bill_id )
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            # links without visible text are not recorded
            if h_name.strip():
                bill.add_document(h_name, h_href)

        self.save_bill(bill)
Exemple #52
0
    def scrape_bill(self, session, session_number, bill_id, title, sponsor,
                    url):
        """Scrape one bill page and save the resulting ``Bill``.

        Reads the history, text, analyses and vote-history tabs of the
        page, then lets ``self.scrape_lower_committee_votes`` add its votes
        before the bill is saved.  A page that cannot be fetched is skipped
        with a warning.

        :param session: session passed to ``Bill``.
        :param session_number: forwarded to
            ``self.scrape_lower_committee_votes``.
        :param bill_id: bill identifier; its first character selects the
            chamber through ``self.CHAMBERS`` and its prefix the bill type.
        :param title: bill title.
        :param sponsor: primary sponsor; a leading "Rep. " / "Sen. " is
            removed.
        :param url: address of the bill page.
        :raises Exception: when the bill type cannot be derived from
            ``bill_id``.
        """
        try:
            html = self.get(url).text
        except Exception as exc:
            # Best effort: an unreachable page skips the bill, but leave a
            # trace in the log instead of failing silently.
            self.logger.warning('Could not fetch %s: %s' % (url, exc))
            return
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill = Bill(session, self.CHAMBERS[bill_id[0]], bill_id, title)
        bill.add_source(url)

        sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
        bill.add_sponsor('primary', sponsor)

        hist_table = page.xpath("//div[@id = 'tabBodyBillHistory']//table")[0]

        if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
            bill_type = 'bill'
        elif bill_id.startswith(('HR ', 'SR ')):
            bill_type = 'resolution'
        elif bill_id.startswith(('HJR ', 'SJR ')):
            bill_type = 'joint resolution'
        elif bill_id.startswith(('SCR ', 'HCR ')):
            bill_type = 'concurrent resolution'
        elif bill_id.startswith(('SM ', 'HM ')):
            bill_type = 'memorial'
        else:
            raise Exception('Failed to identify bill type.')

        bill['type'] = [bill_type]

        for tr in hist_table.xpath("tbody/tr"):
            date = tr.xpath("string(td[1])")
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[2])")
            actor = {'Senate': 'upper', 'House': 'lower'}.get(actor, actor)

            if not actor:
                continue

            # one history cell may hold several bullet-separated actions
            act_text = tr.xpath("string(td[3])").strip()
            for action in act_text.split(u'\u2022'):
                action = action.strip()
                if not action:
                    continue

                # drop a trailing "-HJ <n>" / "-SJ <n>" marker
                action = re.sub(r'-(H|S)J\s+(\d+)$', '', action)

                atype = []
                if action.startswith('Referred to'):
                    atype.append('committee:referred')
                elif action.startswith('Favorable by'):
                    atype.append('committee:passed')
                elif action == "Filed":
                    atype.append("bill:filed")
                elif action.startswith("Withdrawn"):
                    atype.append("bill:withdrawn")
                elif action.startswith("Died"):
                    atype.append("bill:failed")
                elif action.startswith('Introduced'):
                    atype.append('bill:introduced')
                elif action.startswith('Read 2nd time'):
                    atype.append('bill:reading:2')
                elif action.startswith('Read 3rd time'):
                    atype.append('bill:reading:3')
                elif action.startswith('Adopted'):
                    atype.append('bill:passed')
                elif action.startswith('CS passed'):
                    atype.append('bill:passed')
                elif action.startswith('Approved by Gov'):
                    atype.append('governor:signed')

                bill.add_action(actor, action, date, type=atype)

        try:
            version_table = page.xpath(
                "//div[@id = 'tabBodyBillText']/table")[0]
            for tr in version_table.xpath("tbody/tr"):
                name = tr.xpath("string(td[1])").strip()
                version_url = tr.xpath("td/a[1]")[0].attrib['href']
                # NOTE(review): a URL ending in neither PDF nor HTML leaves
                # mimetype unset (or carrying the previous row's value) --
                # confirm which fallback is wanted before changing this.
                if version_url.endswith('PDF'):
                    mimetype = 'application/pdf'
                elif version_url.endswith('HTML'):
                    mimetype = 'text/html'
                bill.add_version(name, version_url, mimetype=mimetype)
        except IndexError:
            self.log("No version table for %s" % bill_id)

        try:
            analysis_table = page.xpath(
                "//div[@id = 'tabBodyAnalyses']/table")[0]
            for tr in analysis_table.xpath("tbody/tr"):
                name = tr.xpath("string(td[1])").strip()
                name += " -- " + tr.xpath("string(td[3])").strip()
                name = re.sub(r'\s+', " ", name)
                date = tr.xpath("string(td[4])").strip()
                if date:
                    name += " (%s)" % date
                analysis_url = tr.xpath("td/a")[0].attrib['href']
                bill.add_document(name, analysis_url)
        except IndexError:
            self.log("No analysis table for %s" % bill_id)

        vote_tables = page.xpath("//div[@id = 'tabBodyVoteHistory']//table")

        for vote_table in vote_tables:
            for tr in vote_table.xpath("tbody/tr"):
                vote_date = tr.xpath("string(td[3])").strip()
                # when the third cell is purely alphabetic, the date is
                # read from the second cell instead
                if vote_date.isalpha():
                    vote_date = tr.xpath("string(td[2])").strip()
                try:
                    vote_date = datetime.datetime.strptime(
                        vote_date, "%m/%d/%Y %H:%M %p").date()
                except ValueError:
                    msg = 'Got bogus vote date: %r'
                    self.logger.warning(msg % vote_date)

                vote_url = tr.xpath("td[4]/a")[0].attrib['href']
                if "SenateVote" in vote_url:
                    self.scrape_floor_vote('upper', bill, vote_date, vote_url)
                elif "HouseVote" in vote_url:
                    self.scrape_floor_vote('lower', bill, vote_date, vote_url)
                else:
                    self.scrape_uppper_committee_vote(bill, vote_date,
                                                      vote_url)

        # Only report a missing table when there really is none; a
        # for/else clause would run after every loop that does not break.
        if not vote_tables:
            self.log("No vote table for %s" % bill_id)

        self.scrape_lower_committee_votes(session_number, bill)

        self.save_bill(bill)
Exemple #53
0
    def scrape_bill(self,
                    url,
                    kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_categorize=actions.categorize,
                    actions_get_actor=actions.get_actor):
        """Build a ``Bill`` from ``kw``, fill it in from ``url``, save it.

        The page at ``url`` supplies sponsors, versions, actions, votes,
        amendments, engrossments, fiscal notes and a few extra text
        fields.  Related files are resolved by ``self.scrape_documents``
        and roll calls by ``self.scrape_vote``.

        :param url: address of the bill page; recorded as a source.
        :param kw: keyword arguments passed straight to ``Bill``.
        :param re_amendment: pattern whose first group is the short
            amendment id.
        :param re_substitution: pattern whose first group is the short
            name of a substituted bill.
        :param re_digits: pattern matched against the session heading to
            obtain the session number.
        :param actions_categorize: callable giving the type of an action.
        :param actions_get_actor: callable giving the actor of an action
            from its text and the bill's chamber.
        """
        bill = Bill(**kw)
        bill.add_source(url)

        # Everything below is read from this one parsed page.
        page = self._url_2_lxml(url)
        _get_text = partial(get_text, page, 0)

        # Session number -- needed for fetching related documents.
        session_heading = page.xpath(
            '//font[contains(., "General Assembly") and @face="Arial"]')[0]
        session_num = re_digits.match(session_heading.text_content()).group()

        # ---- Sponsors ------------------------------------------------------
        chamber = bill['chamber']

        sponsor_kinds = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'
        }

        sponsor_heading = '//font[contains(., "Sponsor") and @color="#008080"]'
        heading_texts = page.xpath(sponsor_heading + '/text()')
        sponsor_texts = page.xpath(
            sponsor_heading + '/../../following-sibling::td/font/text()')

        for heading, raw_names in zip(heading_texts, sponsor_texts):
            cleaned = self._cleanup_sponsors(raw_names, chamber)
            kind = sponsor_kinds[heading.strip()]
            if not cleaned:
                continue
            for sponsor_name, sponsor_chamber in cleaned:
                bill.add_sponsor(kind, sponsor_name, chamber=sponsor_chamber)

        # ---- Versions ------------------------------------------------------
        version_template = '/'.join((
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open',
        ))

        for version in self.scrape_documents(source=url,
                                             docname="introduced",
                                             filename="Legis",
                                             tmp=version_template,
                                             session_num=session_num):
            bill.add_version(**version)

        # If bill is a substitution, add the original as a version.
        substituted = ('//*[contains(text(), "Substituted '
                       'Legislation for Bill:")]')
        original_names = page.xpath(substituted + '/text()')
        original_urls = page.xpath(substituted + '/following-sibling::a/@href')

        for raw_name, original_url in zip(original_names, original_urls):
            short_name = re_substitution.match(raw_name).group(1)
            bill.add_version(short_name, original_url,
                             description='original bill')

        # ---- Actions -------------------------------------------------------
        history_cells = page.xpath(
            '//font[contains(., "Actions History")]'
            '/../following-sibling::table/descendant::td[2]')
        history_lines = [line
                         for line in history_cells[0].text_content().splitlines()
                         if line]

        # Lines are walked in reverse of page order.
        for line in reversed(history_lines):
            date_text, action = line.split(' - ', 1)
            try:
                when = datetime.strptime(date_text, '%b %d, %Y')
            except ValueError:
                # month written out in full
                when = datetime.strptime(date_text, '%B %d, %Y')

            actor = actions_get_actor(action, bill['chamber'])
            type_ = actions_categorize(action)
            bill.add_action(actor, action, when, type_)

        # ---- Votes ---------------------------------------------------------
        vote_marker = '//*[contains(text(), "vote:")]'
        vote_strings = page.xpath(vote_marker + '/text()')

        # Sometimes vote strings are contained in weird, separate elements
        # (probably hand edited); fall back to the parents' full text then.
        if any(not re.search(r'\d', text) for text in vote_strings):
            vote_strings = [parent.text_content()
                            for parent in page.xpath(vote_marker + '/..')]

        vote_urls = page.xpath(vote_marker + '/following-sibling::a/@href')
        for vote_text, vote_url in zip(vote_strings, vote_urls):
            vote = self.scrape_vote(vote_url, **parse_votestring(vote_text))
            if vote:
                bill.add_vote(vote)

        # ---- Amendments ----------------------------------------------------
        amendment_link = ("//font[contains(., 'Amendments')]/"
                          "../../../td[2]/font/a")
        amendment_template = (
            'http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
            'vwLegislation/{id_}/$file/{filename}{format_}?open')

        amendment_sources = page.xpath(amendment_link + '/@href')
        amendment_ids = page.xpath(amendment_link + '/text()')

        for source, id_ in zip(amendment_sources, amendment_ids):
            short_id = re_amendment.match(id_).group(1)
            for amendment in self.scrape_documents(
                    source=source,
                    docname='amendment (%s)' % short_id,
                    filename='Legis',
                    tmp=amendment_template,
                    session_num=session_num,
                    id_=id_):
                bill.add_document(**amendment)

        # ---- Engrossments --------------------------------------------------
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        engrossment_links = page.xpath('//img[@alt="Engrossment"]/../@href')

        if engrossment_links:
            engrossment_template = '/'.join((
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open',
            ))

            for engrossment in self.scrape_documents(
                    source=engrossment_links[0],
                    docname="Engrossment",
                    filename="Engross",
                    tmp=engrossment_template,
                    session_num=session_num,
                    id_=bill['bill_id']):
                bill.add_version(**engrossment)

        # ---- Fiscal notes --------------------------------------------------
        fiscal_links = page.xpath("//img[@alt='Fiscal Note']/../@href")

        if fiscal_links:
            fiscal_template = '/'.join((
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open',
            ))

            for fiscal_note in self.scrape_documents(
                    source=fiscal_links[0],
                    docname="Fiscal Note",
                    filename="Fiscal",
                    tmp=fiscal_template,
                    session_num=session_num):
                bill.add_document(**fiscal_note)

        # ---- Extra fields --------------------------------------------------
        # Each value is read from the second td of the row that holds the
        # named heading.
        sibling_cell = '//font[contains(., "%s")]/../../../td[2]'

        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, heading in extra_fields.iteritems():
            try:
                bill[key] = _get_text(sibling_cell % heading)
            except IndexError:
                # xpath lookup failed; leave the field unset.
                pass

        self.save_bill(bill)
Exemple #54
0
    def parse_house_bill(self, url, session):
        """Scrape a single House bill summary page and save the bill.

        The "print" variant of the summary page is fetched, the bill id,
        description, LR number and bill string are read from it, and a
        ``Bill`` for the ``'lower'`` chamber is built.  Sponsors, actions,
        documents and versions are then attached before the bill is handed
        to ``self.save_bill``.

        Args:
            url: Bill summary URL, relative to ``self.senate_base_url``.
            session: Session identifier passed through to ``Bill``.

        Returns:
            None.  The method returns early, without saving anything, when
            the summary page has no ``entry-title`` element (the URL is
            recorded in ``self.bad_urls``) or when the primary sponsor cell
            contains no child element to take a link from.

        Raises:
            AssertionError: if the rows expected to be labelled
                ``LR Number:`` / ``Bill String:`` carry a different label.
        """
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self.senate_base_url, url)

        with self.urlopen(url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)

            bill_id = bill_page.xpath('//*[@class="entry-title"]')
            if not bill_id:
                self.log("WARNING: bill summary page is blank! (%s)" % url)
                self.bad_urls.append(url)
                return
            bill_id = bill_id[0].text_content()
            bill_id = clean_text(bill_id)

            bill_desc = bill_page.xpath(
                '//*[@class="BillDescription"]')[0].text_content()
            bill_desc = clean_text(bill_desc)

            table_rows = bill_page.xpath('//table/tr')
            # if there is a cosponsor all the rows are pushed down one for
            # the extra row for the cosponsor:
            cosponsor_offset = 0
            if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
                cosponsor_offset = 1

            lr_label_tag = table_rows[3 + cosponsor_offset]
            assert lr_label_tag[0].text_content().strip() == 'LR Number:'
            bill_lr = lr_label_tag[1].text_content()

            # an optional "Governor Action:" row pushes the bill string row
            # down by one more.
            last_action_offset = 0
            if table_rows[4 + cosponsor_offset][0].text_content().strip(
            ) == 'Governor Action:':
                last_action_offset = 1
            official_title_tag = table_rows[5 + cosponsor_offset +
                                            last_action_offset]
            assert official_title_tag[0].text_content().strip(
            ) == 'Bill String:'
            official_title = official_title_tag[1].text_content()

            # could substitute the description for the name,
            # but keeping it separate for now.

            # the first three characters of the bill id select the bill
            # type; anything not listed in bill_types is a plain "bill".
            bill_type = "bill"
            triplet = bill_id[:3]
            if triplet in bill_types:
                bill_type = bill_types[triplet]

            # subjects are keyed by the bill id with its spaces removed.
            subs = []
            bid = bill_id.replace(" ", "")

            if bid in self.subjects:
                subs = self.subjects[bid]
                self.log("With subjects for this bill")

            self.log(bid)

            bill = Bill(session,
                        'lower',
                        bill_id,
                        bill_desc,
                        bill_url=url,
                        bill_lr=bill_lr,
                        official_title=official_title,
                        type=bill_type,
                        subjects=subs)
            bill.add_source(url)

            bill_sponsor = clean_text(table_rows[0][1].text_content())
            try:
                bill_sponsor_link = table_rows[0][1][0].attrib['href']
            except IndexError:
                # the sponsor cell has no child element; the bill is
                # dropped, but say so instead of vanishing silently.
                self.log("WARNING: no sponsor link on bill summary page "
                         "(%s)" % url)
                return

            if bill_sponsor_link:
                bill_sponsor_link = '%s%s' % (self.senate_base_url,
                                              bill_sponsor_link)

            bill.add_sponsor('primary',
                             bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # check for cosponsors
            if cosponsor_offset == 1:
                if len(table_rows[2][1]) == 1:  # just a name
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor(
                        'cosponsor',
                        cosponsor.text_content(),
                        sponsor_link='%s/%s' %
                        (self.senate_base_url, cosponsor.attrib['href']))
                else:  # name ... etal
                    try:
                        cosponsor = table_rows[2][1][0]
                        bill.add_sponsor(
                            'cosponsor',
                            clean_text(cosponsor.text_content()),
                            sponsor_link='%s/%s' %
                            (self.senate_base_url, cosponsor.attrib['href']))
                        # the second element links to the full cosponsor
                        # list, which is parsed separately.
                        self.parse_cosponsors_from_bill(
                            bill,
                            '%s/%s' % (self.senate_base_url,
                                       table_rows[2][1][1].attrib['href']))
                    except scrapelib.HTTPError as e:
                        self.log("WARNING: " + str(e))
                        self.bad_urls.append(url)
                        self.log("WARNING: no bill summary page (%s)" % url)

            actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
            actions_link = '%s/%s' % (self.senate_base_url,
                                      actions_link_tag.attrib['href'])
            actions_link = re.sub("content", "print", actions_link)
            self.parse_house_actions(bill, actions_link)

            # get bill documents (first BillDocsSection)
            doc_tags = bill_page.xpath(
                '//div[@class="BillDocsSection"][1]/span')
            for doc_tag in reversed(doc_tags):
                doc = clean_text(doc_tag.text_content())
                text_url = '%s%s' % (self.senate_base_url,
                                     doc_tag[0].attrib['href'])
                bill.add_document(doc, text_url, mimetype="text/html")

            # get bill versions (second BillDocsSection)
            version_tags = bill_page.xpath(
                '//div[@class="BillDocsSection"][2]/span')
            for version_tag in reversed(version_tags):
                version = clean_text(version_tag.text_content())
                text_url = '%s%s' % (self.senate_base_url,
                                     version_tag[0].attrib['href'])
                pdf_url = '%s%s' % (self.senate_base_url,
                                    version_tag[1].attrib['href'])
                if text_url.endswith('htm'):
                    mimetype = 'text/html'
                elif text_url.endswith('pdf'):
                    mimetype = 'application/pdf'
                else:
                    # without this branch `mimetype` would be unbound on
                    # the first version, or silently inherit the previous
                    # version's value on later ones.
                    self.log("WARNING: unrecognised version file type, "
                             "skipping (%s)" % text_url)
                    continue
                bill.add_version(version,
                                 text_url,
                                 pdf_url=pdf_url,
                                 on_duplicate='use_new',
                                 mimetype=mimetype)
        self.save_bill(bill)