Example #1
0
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill: the name is everything after the "SB 1 -" style prefix.
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # Header/continuation rows carry no date; skip them.
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the loop iterated as `row` but the body read the stale
            # sponsor anchor `a` from the loop above; iterate the anchors.
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Example #2
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill's title, sponsor, actions and versions from its page."""
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()

            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                cells = row.xpath('td')

                # ignore row missing date
                if len(cells) != 2:
                    continue

                when = datetime.datetime.strptime(cells[0].text_content(),
                                                  "%m/%d/%Y")
                description = cells[1].text_content()
                bill.add_action(chamber, description, when)

                # a link inside the action cell is an associated version
                if cells[1].xpath('a'):
                    bill.add_version(description, cells[1].xpath('a/@href')[0])

            bill.add_source(url)
            self.save_bill(bill)
Example #3
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # BUG FIX: the context value was bound as `lxml` (shadowing the
        # library) while the body referenced an undefined `page`.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill: name is everything after the "SB 1 -" style prefix.
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: each row holds senate date, action text, house date.
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # Rows without any date are headers/fillers.
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Example #4
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # BUG FIX: the context value was bound as `lxml` (shadowing the
        # library) while the body referenced an undefined `page`.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table")

            # Bill: name is everything after the "SB 1 -" style prefix.
            name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

            # Sponsorships
            for a in tables[2].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions: each row holds senate date, action text, house date.
            for row in tables[-1].cssselect("tr"):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # Rows without any date are headers/fillers.
                if "/" not in senate_date and "/" not in house_date:
                    continue
                if senate_date:
                    bill.add_action("upper", action_text, senate_date)
                if house_date:
                    bill.add_action("lower", action_text, house_date)

            self.save_bill(bill)
Example #5
0
    def _parse_bill(self, session, chamber, source_url, line):
        """Parse one 0xE4-delimited bill record and save the matching bill.

        The record fields are (type, combined_id, number, title, relating_to).
        Only bills belonging to `chamber` (HB/lower, SB/upper) are processed.
        """
        if not line:
            return
        bill_type, combined_id, number, title, relating_to = line.split("\xe4")
        if (bill_type == 'HB' and chamber == 'lower') or \
           (bill_type == 'SB' and chamber == 'upper'):
            #
            # basic bill info
            bill_id = "%s %s" % (bill_type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            #
            # add actions
            # BUG FIX: dict.has_key() was removed in Python 3; use `in`.
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(
                    self, session, bill_id)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])
                    # a lone sponsor is primary; multiple are cosponsors
                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
Example #6
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions (skip the header row)
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                # rows without a date cannot be parsed
                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the loop iterated as `row` but the body read the stale
            # sponsor anchor `a`; iterate the anchors instead.
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Example #7
0
    def scrape_bill(self, chamber, bill):
        """Scrape a Delaware bill page: title, sponsors, actions and votes."""
        bill_id = bill['id'].replace('w/','with ')

        page = lxml.html.fromstring(self.urlopen(bill['url']))
        page.make_links_absolute(bill['url'])

        title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
        # text_content() == make sure any tags in the title don't cause issues
        title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

        # now we can create a bill object
        b = Bill(bill['session'], bill['chamber'], bill_id, title)
        b.add_source(bill['url'])

        sponsors_row = page.xpath('//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
        sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text

        if sponsor is not None:
            b.add_sponsor('primary', sponsor)

        # scraping these and co-sponsors, but not doing anything with them until
        # it's decided whether or not to attempt to split 'em up
        additional = sponsors_row.xpath('td[@width="48%"]/font')
        additional_sponsors = additional[0].text if len(additional) > 0 else ""
        additional_sponsors = additional_sponsors.replace('&nbsp&nbsp&nbsp','')

        cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0]
        cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
        cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

        introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]')
        if len(introduced_row) > 0:
            # BUG FIX: `.expath` was a typo for `.xpath` (AttributeError), and
            # the leading '/' made the query absolute (document root) instead
            # of relative to the matched row, so it could never match.
            introduced = introduced_row[0].xpath('td[@width="31%"]/font')[0].text
            introduced = datetime.strptime(introduced, '%b %d, %Y')
            b.add_action(bill['chamber'], 'introduced', introduced, 'bill:introduced')

        actions = page.xpath('//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font')
        if len(actions) > 0:
            # one newline-separated "date - description" entry per line
            actions = actions[0].text_content().split('\n')
            for act in actions:
                act = act.partition(' - ')
                date = datetime.strptime(act[0], '%b %d, %Y')
                b.add_action(bill['chamber'], act[2], date)

        # resources = page.xpath('//tr[td/b[contains(font, "Full text of Legislation")]]')

        # save vote urls for scraping later
        vote_urls = []
        voting_reports = page.xpath('//tr[td/b[contains(font, "Voting Reports")]]')
        if len(voting_reports) > 0:
            for report in voting_reports[0].xpath('td/font/a'):
                vote_urls.append(report.attrib['href'])

        # Scrape votes
        for url in vote_urls:
            vote = self.scrape_votes(chamber, title, bill_id, url)
            b.add_vote(vote)

        # Save bill
        self.save_bill(b)
Example #8
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape one Illinois bill status page and save the bill.

        `doc_type` is the document-type prefix (e.g. 'HB'); `bill_type`
        overrides the type otherwise derived from DOC_TYPES.
        """
        try:
            doc = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        # BUG FIX: raw string so '\d' is a regex class, not an invalid
        # string escape (SyntaxWarning on modern Python).
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    summary=summary)

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # actions come in (date, actor, action) cell triples
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()
            bill.add_action(actor, action, date,
                            **_categorize_action(action))
            # sponsor-related actions let us refine the sponsor list
            if action.lower().find('sponsor') != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        for spontype, sponsor, chamber, official_type in sponsor_list:
            if chamber:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type, chamber=chamber)
            else:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(session, bill, votes_url)

        self.save_bill(bill)
Example #9
0
    def scrape(self, session, chambers):
        """Scrape every Quebec bill listed for the given session."""
        urlified_session_id = session.replace(':', '-')
        url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projets-loi-%s.html' % urlified_session_id
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # all actions for the session are fetched up front
        actions = self.scrape_actions(urlified_session_id)

        for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'):
            id_td, details_td = row.xpath('td')[:2]
            bill_id = clean_spaces(id_td.text_content())

            pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0]
            bill_name = clean_spaces(pdf_link.text_content())
            pdf_url = pdf_link.xpath('@href')[0]
            detail_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projet-loi-%s-%s.html' % (bill_id, urlified_session_id)

            bill = Bill(session, 'lower', bill_id, bill_name)
            for source in (url, detail_url, pdf_url):
                bill.add_source(source)

            # add actions
            for action in actions[bill_id]:
                bill.add_action('lower', action['name'], action['date'])

            # get sponsors
            self.scrape_details(bill, detail_url)
            self.save_bill(bill)
Example #10
0
    def scrape2001(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table center table")

            # Bill: the name follows the "SB 1 -" style prefix.
            name = tables[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions
            center = page.cssselect("table center")[-1]

            for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                # rows with no date are headers/fillers
                if "/" not in date:
                    continue
                if action_text.startswith("Senate"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("upper", action_text, date)
                elif action_text.startswith("House"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("lower", action_text, date)

            # Versions
            # BUG FIX: the loop iterated as `row` but the body read the stale
            # sponsor anchor `a`; iterate the anchors instead.
            for a in center.cssselect("table table")[1].cssselect("a"):
                bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

            self.save_bill(bill)
Example #11
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one Oklahoma bill page: title, sponsors, actions, versions, votes."""
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        # classify by id suffix; "JR" must be tested before "R"
        for marker, kind in (("JR", "joint resolution"),
                             ("CR", "concurrent resolution"),
                             ("R", "resolution")):
            if marker in bill_id:
                bill_type = [kind]
                break
        else:
            bill_type = ["bill"]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill["subjects"] = self.subject_map[bill_id]

        # 'Auth' links name sponsors; 'otherAuth' ids mark coauthors
        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()
            role = "coauthor" if "otherAuth" in link.attrib["id"] else "author"
            bill.add_sponsor(role, name)

        # action table: the first two rows are headers
        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            when = datetime.datetime.strptime(
                tr.xpath("string(td[3])").strip(), "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            actor = {"H": "lower", "S": "upper"}.get(actor, actor)

            bill.add_action(actor, action, when, type=action_type(action))

        # versions are .DOC links; committee reports are excluded
        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib["href"]
            if "COMMITTEE REPORTS" in version_url:
                continue
            bill.add_version(link.text.strip(), version_url)

        # vote pages are scraped separately
        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib["href"]))

        self.save_bill(bill)
Example #12
0
    def scrape_current(self, chamber, term):
        """Scrape current-session Kansas bills for one chamber from the KS API."""
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # BUG FIX: btype was left unbound for ids matching none of
                    # the patterns, raising NameError below; default to 'bill'.
                    btype = 'bill'

                # main
                bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                            type=btype, status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                if bill_data['LONGTITLE']:
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; multiple are cosponsors
                for sponsor in bill_data['SPONSOR_NAMES']:
                    stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                             else 'cosponsor')
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper' if event['chamber'] == 'Senate'
                             else 'lower')

                    date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning('unknown action code on %s: %s %s' %
                                     (bill_id, event['action_code'],
                                      event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Example #13
0
    def process_bill(self, data):
        """Convert a pupa-style bill dict into a Bill object and save it."""
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            chamber = 'upper'

        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(action['organization_id'])['classification']
            # split related entities by kind
            committees = [rel['name'] for rel in action['related_entities']
                          if rel['entity_type'] == 'organization']
            legislators = [rel['name'] for rel in action['related_entities']
                           if rel['entity_type'] == 'person']
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']),
                            committees=committees,
                            legislators=legislators,
                            **action.get('extras', {}))

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(sponsor['classification'], sponsor['name'])

        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'], link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']),
                                 **version.get('extras', {}))

        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'], link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']),
                                  **doc.get('extras', {}))

        for title in data['other_titles']:
            bill.add_title(title['title'])

        for related in data['related_bills']:
            bill.add_companion(related['identifier'],
                               related['legislative_session'],
                               chamber)

        self.save_bill(bill)
Example #14
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        if chamber == "House":
            chamber = 'lower'
        else:
            chamber = 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            doc = lxml.html.fromstring(bill_html)

            bill_id = doc.xpath('//title/text()')[0].split()[0]
            bill_title = doc.xpath('//font[@size=-1]/text()')[0]
            # the second character of the id encodes the bill type
            bill_type = {'F': 'bill', 'R':'resolution',
                         'C': 'concurrent resolution'}[bill_id[1]]
            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill['subjects'] = self._subject_mapping[bill_id]
            bill.add_source(bill_detail_url)

            # grab sponsors: first is primary, the rest are cosponsors
            sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
            if sponsors:
                primary_sponsor = sponsors[0].strip()
                bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
                cosponsors = sponsors[1:]
                for leg in cosponsors:
                    bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)

            # Add Actions performed on the bill.
            bill_actions = self.extract_bill_actions(doc, chamber)
            for action in bill_actions:
                kwargs = {}
                if 'committee' in action:
                    kwargs['committees'] = action['committees']

                bill.add_action(action['action_chamber'],
                                action['action_text'],
                                action['action_date'],
                                type=action['action_type'],
                                **kwargs)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the column
        # labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            if 'resolution' in version_html.response.url:
                bill.add_version('resolution text', version_html.response.url,
                                 mimetype='text/html')
            else:
                version_doc = lxml.html.fromstring(version_html)
                for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                    # BUG FIX: mimetype was being passed to urlparse.urljoin
                    # (a TypeError); it belongs on bill.add_version instead.
                    version_url = urlparse.urljoin(VERSION_URL_BASE,
                                                   v.get('href'))
                    bill.add_version(v.text.strip(), version_url,
                                     mimetype='text/html')

        self.save_bill(bill)
Example #15
0
    def scrape(self, session, chambers):
        """Scrape Newfoundland & Labrador bills from the progress table."""
        # Get the progress table.
        url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        for tr in doc.xpath('//table[@class="bills"]/tr')[1:]:
            bill_id = clean_spaces(tr[0].text_content()).strip('*')
            if not bill_id:
                break # empty rows extend past actual list of bills
            if bill_id.endswith("."):
                bill_id = bill_id[:-1]

            title = clean_spaces(tr[1].text_content())
            chapter = tr[-1].text_content()

            bill = Bill(session, 'lower', bill_id, title, type='bill')

            if chapter:
                bill['chapter'] = chapter

            # FIXME need to do more work to figure out what
            # version the text *really* is
            td = tr[1]
            bill_url = td.xpath('a/@href')
            if bill_url:
                bill.add_version(url=bill_url.pop(), name='First Reading',
                    mimetype='text/html')

            # Actions and version urls.
            data = zip([
                'First Reading',
                'Second Reading',
                'Committee',
                'Amendments',
                'Third Reading',
                'Royal Assent',
                'Act'],
                tr[2:-1])

            for action, td in data:
                date_text = td.text_content()
                fmt = r'%b. %d/%Y'
                # BUG FIX: the original try/except/else ended with
                # `else: break`, which exited this loop as soon as one date
                # parsed, so bill.add_action below was unreachable dead code.
                try:
                    date = datetime.datetime.strptime(date_text, fmt)
                except ValueError:
                    # cell holds no date in this format; skip the stage
                    continue

                attrs = dict(action=action, date=date, actor='lower')
                attrs.update(self.categorizer.categorize(action))
                bill.add_action(**attrs)

            bill.add_source(url)
            self.save_bill(bill)
Example #16
0
    def scrape_xml(self, chamber, session):
        """Scrape Georgia bills from the session's BillSummary.xml feed."""
        start_letter = "S" if chamber == "upper" else "H"
        sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"}
        version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

        summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
        doc = lxml.etree.fromstring(self.urlopen(summary_url))

        for bxml in doc.xpath("//Bill"):
            doc_code = bxml.get("Type")

            # if this is from the other chamber skip it
            if not doc_code.startswith(start_letter):
                continue

            bill_id = doc_code + bxml.get("Num") + bxml.get("Suffix")
            if doc_code in ("HB", "SB"):
                bill_kind = "bill"
            elif doc_code in ("HR", "SR"):
                bill_kind = "resolution"
            else:
                raise ValueError("unknown type: %s" % doc_code)

            # use short_title as title and long as description
            title = bxml.xpath("Short_Title/text()")[0]
            description = bxml.xpath("Title/text()")[0]

            bill = Bill(session, chamber, bill_id, title, type=bill_kind,
                        description=description)
            bill.add_source(summary_url)

            for sponsor in bxml.xpath("Sponsor"):
                sponsor_name, code = sponsor.text.rsplit(" ", 1)
                sponsor_name = sponsor_name.replace(",", ", ")
                bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")],
                                 sponsor_name, _code=code)

            for version in bxml.xpath("Versions/Version"):
                # NOTE: it is possible to get PDF versions by using .get('Id')
                # ex. URL:  legis.ga.gov/Legislation/20112012/108025.pdf
                # for now we just get HTML
                description, file_id = version.xpath("*/text()")
                bill.add_version(description, version_url + file_id)

            for status in bxml.xpath("StatusHistory/Status"):
                when = datetime.datetime.strptime(status.get("StatusDate"),
                                                  "%Y-%m-%dT%H:%M:%S")
                code = status.get("StatusCode")
                if code in ("EFF", "Signed Gov"):
                    actor = "executive"
                elif code[0] == "S":
                    actor = "upper"
                elif code[0] == "H":
                    actor = "lower"

                bill.add_action(actor, status.text, when,
                                self._action_codes[code])

            self.save_bill(bill)
Example #17
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single Oklahoma bill page and save it.

        Collects the title, authors/coauthors, actions, DOC versions and
        vote links found on the bill's status page.
        """
        page = lxml.html.fromstring(self.urlopen(url))

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        # Classify by id suffix; 'JR' must be tested before 'R'.
        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        # 'Auth' links name sponsors; ids containing 'otherAuth' are coauthors.
        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('coauthor', name)
            else:
                bill.add_sponsor('author', name)

        # Action table: the first two rows are headers.
        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # 'H'/'S' map to lower/upper; any other value passes through as-is.
            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date,
                            type=action_type(action))

        # Versions are .DOC links; committee reports are excluded.
        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        # Vote pages are scraped separately.
        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        self.save_bill(bill)
Exemple #18
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_type):
        """Scrape a Louisiana bill detail page.

        Collects author/coauthors, digests, versions and the action history,
        then saves the bill.  ``bill_type`` arrives as a designator such as
        'HB'/'SCR' and is mapped via its suffix.

        BUG FIX: the action loop previously rebound the ``chamber`` and
        ``page`` parameters (and the ``author`` variable), silently clobbering
        them for the rest of the method; loop variables are now distinct.
        """
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(
            page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def linked_page(link_text):
            # Follow the named link on the bill page and scrape its target.
            # Raises IndexError when the link is absent.
            href = page.xpath(
                "//a[contains(text(), '%s')]" % (link_text))[0].attrib["href"]
            return self.scrape_bare_page(href)

        authors = [x.text for x in linked_page("Authors")]

        try:
            digests = linked_page("Digests")
        except IndexError:
            digests = []

        try:
            versions = linked_page("Text")
        except IndexError:
            versions = []

        title = page.xpath(
            "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']/"
            "/table[@style='font-size:small']/tr")

        bill_id = page.xpath(
            "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(bill_url)

        # The primary author also appears in the full author list.
        authors.remove(author)
        bill.add_sponsor("primary", author)
        for coauthor in authors:
            bill.add_sponsor("cosponsor", coauthor)

        for digest in digests:
            bill.add_document(digest.text, digest.attrib["href"],
                              mimetype="application/pdf")

        for version in versions:
            bill.add_version(version.text, version.attrib["href"],
                             mimetype="application/pdf")

        flags = {"prefiled": ["bill:filed"],
                 "referred to the committee": ["committee:referred"]}

        for action_row in actions:
            date_str, act_chamber, _page_no, text = [
                x.text for x in action_row.xpath(".//td")]
            # The table omits the year; the session supplies it.  Session is
            # April --> June. Prefiles look like they're in January at
            # earliest.
            date_str += "/%s" % (session)
            date = dt.datetime.strptime(date_str, "%m/%d/%Y")
            actor = {"S": "upper", "H": "lower", "J": "joint"}[act_chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            if cat == []:
                cat = ["other"]
            bill.add_action(actor, text, date, cat)

        self.save_bill(bill)
Exemple #19
0
    def scrape_bill(self, term, bill_url):
        """Scrape a single Tennessee bill page.

        Derives the primary chamber and bill id from the sponsor links — the
        '*' prefix marks the originating chamber's bill number — then collects
        the title, primary sponsor, summary document, actions and votes.
        """
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            
            # Bill number shown next to the sponsor, e.g. "*HB 1234".
            chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
            
            if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:
            
                # A companion bill exists in the other chamber.
                chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text

                # The '*' marks the primary bill; [1:] drops it.  The slice
                # end len(chamberN) is harmless since the de-spaced string is
                # never longer than the original.
                if '*' in chamber1:
                    bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                    secondary_bill_id = chamber2.replace(' ', '')
                else:
                    bill_id = chamber2.replace(' ', '')[1:len(chamber2)]
                    secondary_bill_id = chamber1.replace(' ', '')
                
                primary_chamber = 'lower' if 'H' in bill_id else 'upper'

            else:
                # No companion bill.
                primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
                bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                secondary_bill_id = None
            
            title = page.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(term, primary_chamber, bill_id, title, secondary_bill_id=secondary_bill_id)
            bill.add_source(bill_url)
            
            # Primary Sponsor: text after "by", stripped of the '*' marker.
            sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
            sponsor = sponsor.replace('*','').strip()
            bill.add_sponsor('primary',sponsor)
            
            # Co-sponsors unavailable for scraping (loaded into page via AJAX)
            
            # Full summary doc
            summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary.get('href'))
            
            # Actions: skip the header row of the history table.
            tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
            actions_table = tables[0]
            action_rows = actions_table.xpath("tr[position()>1]")
            for ar in action_rows:
                action_taken = ar.xpath("td")[0].text
                action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
                #NEED TO ADD SECONDARY ACTIONS
                bill.add_action(primary_chamber, action_taken, action_date)

            # Votes page, if linked; scrape_votes returns the updated bill.
            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if(len(votes_link) > 0):
                votes_link = votes_link[0].get('href')
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

            self.save_bill(bill)
Exemple #20
0
    def scrape_current(self, chamber, term):
        """Scrape current-session Kansas bills for ``chamber`` from the
        bill_status API, skipping bills whose history never touches the
        requested chamber.

        BUG FIXES vs. the original:
        - the chamber filter initialized ``bill_equal_chamber`` but tested
          ``bill_is_in_chamber`` (NameError whenever no history matched);
        - the actor lookup read stale ``history`` instead of ``event`` and
          pulled committee/conferee names off ``bill_data`` instead of the
          event (the sibling scraper reads them off the event);
        - the fallback compared ``chamber`` ('upper'/'lower') against
          'Senate', so it always yielded 'lower'.
        """
        chamber_name = "Senate" if chamber == "upper" else "House"
        with self.urlopen(
            ksapi.url + "bill_status/"
        ) as bill_request:  # perhaps we should save this data so we can make one request for both chambers?
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:
                # filtering out other chambers
                bill_is_in_chamber = False
                for history in bill_data["HISTORY"]:
                    if history["chamber"] == chamber_name:
                        bill_is_in_chamber = True
                if not bill_is_in_chamber:
                    continue

                # main
                bill = Bill(term, chamber, bill_data["BILLNO"], bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_data["BILLNO"].lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                for sponsor in bill_data["SPONSOR_NAMES"]:
                    bill.add_sponsor("primary" if len(bill_data["SPONSOR_NAMES"]) == 1 else "cosponsor", sponsor)

                for event in bill_data["HISTORY"]:
                    # Committees/conferees involved in the event act as the
                    # actor; otherwise fall back to the scraped chamber.
                    if "committee_names" in event and "conferee_names" in event:
                        actor = " and ".join(event["committee_names"] + event["conferee_names"])
                    elif "committee_names" in event:
                        actor = " and ".join(event["committee_names"])
                    elif "conferee_names" in event:
                        actor = " and ".join(event["conferee_names"])
                    else:
                        actor = chamber

                    date = datetime.datetime.strptime(event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        votes = votes_re.match(event["status"])
                        if votes:
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(ksapi.ksleg + "bill_status/" + bill_data["BILLNO"].lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Exemple #21
0
    def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
        """Scrape one Indiana bill page: sponsors, actions and documents
        (including roll calls)."""
        self.logger.info('GET ' + url)
        resp = self.get(url)
        doc = lxml.html.fromstring(resp.text)
        doc.make_links_absolute(url)

        bill = Bill(term, chamber, bill_id, title)
        bill.add_source(url)
        if subject is not None:
            bill['subjects'] = [subject]

        # Sponsors: translate the page's role label into openstates types.
        role_map = {
            'author': 'primary',
            'co-author': 'cosponsor',
            'sponsor': 'cosponsor',
            'co-sponsor': 'cosponsor',
            }
        for author_div in doc.xpath('//div[contains(@class, "bill-author-info")]'):
            sponsor_name = author_div.xpath('string(b)').strip()
            role = role_map[author_div.xpath('string(p)').strip().lower()]
            bill.add_sponsor(role, sponsor_name)

        # Actions (walked in reverse page order).
        for item in reversed(doc.xpath('//div[@id="bill-actions"]//li')):
            if item.text_content() == 'None currently available.':
                continue
            chamber_str = item.xpath('string(strong)').strip()
            actor = dict(H='lower', S='upper')[chamber_str]
            raw_date = item.xpath('string(span[@class="document-date"])')
            # Some resolution actions have no dates.
            if not raw_date.strip():
                continue
            action_text = item.xpath('string(span[2])').strip()
            if not action_text.strip():
                continue
            kwargs = dict(
                date=datetime.datetime.strptime(raw_date.strip(), '%m/%d/%Y'),
                actor=actor,
                action=action_text)
            kwargs.update(**self.categorizer.categorize(action_text))
            bill.add_action(**kwargs)

        # Documents (including votes)
        for doc_type, doc_meta in BillDocuments(self, doc):
            label = doc_meta.title or doc_meta.text
            if doc_type == 'version':
                bill.add_version(label, url=doc_meta.url,
                                 mimetype='application/pdf')
            elif doc_type == 'document':
                bill.add_document(label, url=doc_meta.url,
                                  mimetype='application/pdf')
            elif doc_type == 'rollcall':
                self.add_rollcall(chamber, bill, doc_meta)

        self.save_bill(bill)
Exemple #22
0
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        """Parse a single Kentucky bill info page.

        Returns early (skipping the bill) when no version link exists — that
        signals a withdrawn bill.  Otherwise records the most recent version,
        sponsors, the action history and the vote-history PDF.

        FIX: regex patterns are now raw strings so ``\\d`` stays a regex
        escape rather than a deprecated string escape.
        """
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    r'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                # Actions look like "Mon DD-action text"; the year is absent.
                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # Derive the year from the session string — assumes the
                # session starts with a 4-digit year (TODO confirm format).
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile(r'.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Exemple #23
0
    def scrape_bill(self, chamber, session, billid, histurl, year):
        """Scrape one bill from its BeautifulSoup-parsed history page:
        title, sponsor, versions and the action history."""
        if year[0] != 'R':
            session = year
        else:
            # NOTE(review): in this branch year[0] == 'R', so int(year[0])
            # raises ValueError — a different character of ``year`` was
            # probably intended; confirm against the metadata format.
            session = self.metadata['session_details'][year][
                'sub_sessions'][int(year[0]) - 1]

        with self.urlopen(histurl) as data:
            soup = BeautifulSoup(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            # SUMMARY / SPONSOR appear as bold labels; pull the text and
            # sponsor links that follow each one.
            sponsor = None
            title = None
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            # Each row of the basic-info table links one version.
            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            # Action history table; first row is the header.
            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Exemple #24
0
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        """Scrape one Hawaii bill status page (measure view)."""
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_url)

            # split "SB1 SD2 HD2" to get SB1
            bill_id = page.xpath(
                '//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]

            title = page.xpath(
                '//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
            subjects = page.xpath(
                '//span[@id="ListView1_ctrl0_report_titleLabel"]'
            )[0].text.split('; ')
            subjects = [s.strip() for s in subjects if s.strip()]
            description = page.xpath(
                '//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
            introducers = page.xpath(
                '//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
            referral = page.xpath('//span[contains(@id, "referral")]/text()')[0]

            bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                        type=bill_type, description=description,
                        referral=referral)
            for name in introducers.split(', '):
                # Trim the "(BR)" by-request marker.
                if name.endswith(' (BR)'):
                    name = name[:-5]
                bill.add_sponsor('primary', name)

            # Actions: each data row of the status grid holds
            # date / chamber-letter / action text.
            status_table = page.xpath('//table[@id="GridViewStatus"]')[0]
            for row in status_table.xpath('tr'):
                cells = row.xpath('td')
                if len(cells) != 3:
                    continue
                actor = house[cells[1].xpath('font')[0].text]
                action_text = cells[2].xpath('font')[0].text
                when = datetime.strptime(
                    cells[0].xpath('font')[0].text, "%m/%d/%Y")
                bill.add_action(actor, action_text, when,
                                type=categorize_action(action_text))
                self.parse_vote(bill, action_text, actor, when)

            # add versions
            try:
                for version in page.xpath('//a[contains(@id, "StatusLink")]'):
                    bill.add_version(version.text.replace('_', ' '),
                                     version.get('href'))
            except IndexError:  # href not found.
                pass

        bill.add_source(bill_url)
        self.save_bill(bill)
Exemple #25
0
 def scrape_bill_status_page(self, url, params=None):
     """Scrapes the status page url, populating parameter dict and
     returns bill.

     BUG FIX: ``params`` previously defaulted to a shared mutable dict
     (``params={}``) that is mutated below, so entries leaked between
     calls; it now defaults to None and a fresh dict is made per call.
     """
     if params is None:
         params = {}
     with self.urlopen(url) as page:
         page = lxml.html.fromstring(page)
         page.make_links_absolute(url)
         params["bill_id"] = page.xpath('//h3[contains(@class, "center")]/a')[0].text.split()[0]
         params["title"] = page.xpath(
             '//div[div[contains( \
             ., "Report Title")]]/div[contains(@class, "rightside")]'
         )[0].text.strip()
         sponsors = page.xpath(
             '//div[div[contains( \
             ., "Introducer")]]/div[contains(@class, "rightside")]'
         )[0].text
         subject = page.xpath(
             '//div[div[contains( \
             ., "Measure Title")]]/div[contains(@class, "rightside")]'
         )[0].text.strip()
         subject = subject.replace("RELATING TO ", "")  # Remove lead text
         params["subject"] = subject.replace(".", "")
         params["description"] = page.xpath(
             '//div[div[contains( \
             ., "Description")]]/div[contains(@class, "rightside")]'
         )[0].text
         params["companion"] = page.xpath(
             '//div[div[contains( \
             ., "Companion")]]/div[contains(@class, "rightside")]'
         )[0].text
         if params["title"] == "":
             params["title"] = params["subject"]
         # Collect actions from the date/chamber/action status table.
         actions = []
         table = page.xpath('//table[tr/th[contains(., "Date")]]')[0]
         for row in table.xpath("tr[td]"):  # Ignore table header row
             action_params = {}
             cells = row.xpath("td")
             if len(cells) == 3:
                 ch = cells[1].text
                 action_params["actor"] = house[ch]
                 action_params["action"] = cells[2].text
                 action_date = cells[0].text.split()[0]  # Just get date, ignore any time.
                 try:
                     action_params["date"] = datetime.strptime(action_date, "%m/%d/%y")
                 except ValueError:  # Try a YYYY format.
                     action_params["date"] = datetime.strptime(action_date, "%m/%d/%Y")
                 actions.append(action_params)
         bill = Bill(**params)
         bill.add_sponsor("primary", sponsors)
         for action_params in actions:
             bill.add_action(**action_params)
     self.save_bill(bill)
     return bill
Exemple #26
0
    def scrape_bill(self, chamber, session, doc_type, url):
        """Scrape one Illinois bill page.

        ``doc_type`` is the designator prefix (e.g. 'HB'); the numeric part
        is read from the DocNum query parameter of ``url``.  Sponsors are
        collected first and refined with action data before being attached.

        FIX: the DocNum pattern is now a raw string so ``\\d`` stays a
        regex escape rather than a deprecated string escape.
        """
        doc = self.url_to_doc(url)
        # bill id, title, synopsis
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    synopsis=synopsis)

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # actions: the table holds repeating (date, actor, action) cells
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()
            bill.add_action(actor, action, date,
                            **_categorize_action(action))
            # sponsor-related actions carry chamber info used for refinement
            if action.lower().find('sponsor') != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        for spontype, sponsor, chamber in sponsor_list:
            if chamber:
                bill.add_sponsor(spontype, sponsor, chamber=chamber)
            else:
                bill.add_sponsor(spontype, sponsor)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(session, bill, votes_url)

        self.save_bill(bill)
Exemple #27
0
    def scrape_bill(self, session, chamber, bill_type, bill_url):
        """Scrape one Hawaii bill status page (legacy table layout)."""
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_url)

            # split "SB1 SD2 HD2" to get SB1
            bill_id = page.xpath('//a[@class="headerlink"]')[0].text.split()[0]

            info_table = page.xpath('//table[@cellspacing="4px"]')[0]

            title = get_table_text(info_table, "Measure Title")
            subjects = get_table_text(info_table, "Report Title").split('; ')
            description = get_table_text(info_table, "Description")
            introducers = get_table_text(info_table, "Introducer(s)")

            bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                        type=bill_type, description=description)
            for name in introducers.split(', '):
                # Drop the "(BR)" by-request suffix.
                if name.endswith(' (BR)'):
                    name = name[:-5]
                bill.add_sponsor('primary', name)

            # Actions: each data row holds date / chamber-letter / text.
            status_table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            for row in status_table.xpath('tr'):
                cells = row.xpath('td')
                if len(cells) != 3:
                    continue
                actor = house[cells[1].xpath('font')[0].text]
                action_text = cells[2].xpath('font')[0].text
                when = datetime.strptime(cells[0].xpath('font')[0].text,
                                         "%m/%d/%Y")
                bill.add_action(actor, action_text, when,
                                type=categorize_action(action_text))
                self.parse_vote(bill, action_text, actor, when)

            # Add version document if not on a javascript link.
            try:
                bill_version = page.xpath(
                    '//a[contains(@id, "HyperLinkPDF")]')[0].attrib['href']
                bill.add_version('Current version', bill_version)
            except IndexError:  # href not found.
                pass

        bill.add_source(bill_url)
        self.save_bill(bill)
Exemple #28
0
    def scrape(self, session, chambers):
        """Scrape Saskatchewan bills from the 'Progress of Bills' PDF."""
        url = 'http://www.legassembly.sk.ca/legislative-business/bills/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        url = doc.xpath('//a[text() = "Progress of Bills"]/@href').pop()
        filename, resp = self.urlretrieve(url)

        doc = pdf_to_lxml(filename)

        # Column order of the progress table in the PDF.
        action_names = (
            'First Reading',
            'Crown recommendation',
            'Committee',
            'Second Reading',
            'Committee',
            'Amend Date',
            'Third Reading',
            'Royal Assent',
            'In Effect',
            )

        for anchor in doc.xpath('//a[contains(@href, "legdocs/Bills")]'):
            bill_id = anchor.text_content().strip()

            def is_br(el):
                return el.tag == 'br'

            # Consecutive <br> siblings carry the star, title, sponsor
            # and the date columns in their tails.
            sibs = list(takewhile(is_br, anchor.itersiblings()))

            # If the star is missing, insert it to avoid complicated code.
            if sibs[0].tail.strip() != '*':
                sibs.insert(0, DummyBR('br', None, '*'))

            title_chunks = [sibs[1].tail.strip()]
            sponsor = sibs[2].tail.strip()
            dates = sibs[3].tail.split(u'\xa0')
            title_chunks.extend((br.tail or '').strip() for br in sibs[4:])
            title = ' '.join(title_chunks).strip()

            bill = Bill(session, 'lower', bill_id, title, type='bill')
            bill.add_sponsor(name=sponsor, type='primary')

            for action_name, raw_date in zip(action_names, dates):
                when = datetime.datetime.strptime(raw_date.strip(), '%Y-%m-%d')
                attrs = dict(action=action_name, date=when, actor='lower')
                attrs.update(self.categorizer.categorize(action_name))
                bill.add_action(**attrs)

            bill.add_source(url)
            bill.add_version('Introduced', anchor.attrib['href'],
                             mimetype='application/pdf')
            self.save_bill(bill)
Exemple #29
0
    def scrape_bill(self, chamber, session, doc_type, url):
        """Scrape one Illinois bill page: id, sponsors, actions, versions
        and votes.

        ``doc_type`` is the designator prefix (e.g. 'HB'); the numeric part
        comes from the DocNum query parameter of ``url``.

        FIX: the DocNum pattern is now a raw string so ``\\d`` stays a
        regex escape rather than a deprecated string escape.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # bill id, title, synopsis
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    synopsis=synopsis)

        # sponsors
        for sponsor in doc.xpath('//a[@class="content"]/text()'):
            bill.add_sponsor('cosponsor', sponsor)

        # actions: the table holds repeating (date, actor, action) cells
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()

            # TODO: categorize actions

            bill.add_action(actor, action, date)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(bill, votes_url)
            bill.add_source(votes_url)

        bill.add_source(url)
        self.save_bill(bill)
Exemple #30
0
    def scrape1995(self, url, year, chamberName, session, number):
        """Scrape a 1995-96 era Georgia bill summary page.

        e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm
        """
        with self.lxml_context(url) as page:
            # Bill: the name follows the dash after the first <br> in the h3.
            name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the same path under /fulltext/.
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'),
                             mimetype='text/html')

            # Sponsorships: names between the header row and the links row.
            rows = page.cssselect('center table tr')
            for row in rows:
                if row.text_content().strip() == 'Sponsor and CoSponsors':
                    continue
                if row.text_content().strip() == 'Links / Committees / Status':
                    break
                for a in row.cssselect('a'):
                    bill.add_sponsor('', a.text_content().strip())

            # Actions
            # The actions are in a pre table that looks like:
            """    SENATE                         HOUSE
                   -------------------------------------
                 1/13/95   Read 1st time          2/6/95
                 1/31/95   Favorably Reported
                 2/1/95    Read 2nd Time          2/7/95
                 2/3/95    Read 3rd Time
                 2/3/95    Passed/Adopted                   """

            # Parse the fixed-width columns shown above: senate date,
            # action text, house date.  Skip the two header lines.
            actions = page.cssselect('pre')[0].text_content().split('\n')
            actions = actions[2:]
            for action in actions:
                senate_date = action[:22].strip()
                action_text = action[23:46].strip()
                house_date = action[46:].strip()

                # Lines without a date in either column are not actions.
                if '/' not in senate_date and '/' not in house_date:
                    continue

                if senate_date:
                    bill.add_action('upper', action_text, senate_date)

                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Exemple #31
0
    def scrape_current(self, chamber, term):
        """Scrape current-term bills from the Kansas bill_status API.

        :param chamber: 'upper' (Senate) or 'lower' (House)
        :param term: term identifier, used as the bill's session
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        # Bill ids are prefixed with the chamber letter ('S'/'H').
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                # Classify by id; test 'CR' before 'R' since 'R' is a
                # substring of 'CR'.  Default to 'bill' so an unexpected
                # id cannot leave btype unbound (previously a NameError on
                # the first iteration, or a silently stale value carried
                # over from the prior bill).
                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                else:
                    btype = 'bill'

                # main
                bill = Bill(term,
                            chamber,
                            bill_id,
                            bill_data['SHORTTITLE'],
                            type=btype,
                            status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                if bill_data['LONGTITLE']:
                    bill.add_title(bill_data['LONGTITLE'])

                # A lone sponsor is the primary; otherwise everyone is a
                # cosponsor.  Hoisted out of the loop: it doesn't vary
                # per-sponsor.
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                for sponsor in bill_data['SPONSOR_NAMES']:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper'
                             if event['chamber'] == 'Senate' else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning(
                            'unknown action code on %s: %s %s' %
                            (bill_id, event['action_code'], event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Exemple #32
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single Oklahoma bill page: title, sponsors, actions,
        versions and votes, then save the bill.

        :param chamber: 'upper' or 'lower'
        :param session: session identifier
        :param bill_id: e.g. 'HB 1001'; used to infer the bill type
        :param url: bill detail page URL
        """
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        # A placeholder title means this is a bogus/empty page, which
        # appears to happen occasionally on their website.  Bail out now,
        # before wasting work (and extra vote-page requests) on a bill
        # that would never be saved anyway.
        if title == "Short Title Not Found.":
            return

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        # Author links: 'otherAuth' ids mark coauthors.
        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('cosponsor', name)
            else:
                bill.add_sponsor('primary', name)

        # Actions table: skip the two header rows.
        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        # Versions are Word documents; committee reports are not versions.
        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url, mimetype='application/msword')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        self.save_bill(bill)
Exemple #33
0
    def get_bill_info(self, session, bill_id):
        """Scrape one North Carolina bill's detail page: title, versions,
        sponsors, actions and (for the latest session) subjects.

        :param session: session identifier used in the lookup URL
        :param bill_id: raw id such as 'H123'/'S45'; the chamber is taken
            from its first letter and the id is rewritten with a type
            infix (e.g. 'HB 123')
        """
        bill_detail_url = 'http://www.ncga.state.nc.us/gascripts/'\
            'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s' % (
            session, bill_id)

        if bill_id[0] == 'H':
            chamber = 'lower'
        else:
            chamber = 'upper'

        # parse the bill data page, finding the latest html text
        with self.urlopen(bill_detail_url) as data:
            doc = lxml.html.fromstring(data)

            # Check 'Joint Resolution' before 'Resolution' (substring),
            # and fall back to plain 'bill' so bill_type can never be
            # unbound below (previously a NameError for unexpected titles).
            title_div_txt = doc.xpath('//div[@id="title"]/text()')[0]
            if 'Joint Resolution' in title_div_txt:
                bill_type = 'joint resolution'
                bill_id = bill_id[0] + 'JR ' + bill_id[1:]
            elif 'Resolution' in title_div_txt:
                bill_type = 'resolution'
                bill_id = bill_id[0] + 'R ' + bill_id[1:]
            elif 'Bill' in title_div_txt:
                bill_type = 'bill'
                bill_id = bill_id[0] + 'B ' + bill_id[1:]
            else:
                bill_type = 'bill'

            title_style_xpath = '//div[@style="text-align: center; font: bold 20px Arial; margin-top: 15px; margin-bottom: 8px;"]/text()'
            bill_title = doc.xpath(title_style_xpath)[0]

            bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
            bill.add_source(bill_detail_url)

            # skip first PDF link (duplicate link to cur version)
            if chamber == 'lower':
                link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
            else:
                link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
            for vlink in doc.xpath(link_xpath)[1:]:
                # get the name from the PDF link...
                version_name = vlink.text.replace(u'\xa0', ' ')
                # but neighboring span with anchor inside has the HTML version
                version_url = vlink.xpath('./following-sibling::span/a/@href')
                version_url = 'http://www.ncga.state.nc.us' + version_url[0]
                bill.add_version(version_name, version_url)

            # sponsors
            pri_td = doc.xpath('//th[text()="Primary:"]/following-sibling::td')
            pri_text = pri_td[0].text_content().replace(u'\xa0',
                                                        ' ').split('; ')
            for leg in pri_text:
                leg = leg.strip()
                if leg:
                    # drop a trailing semicolon left over from the split
                    if leg[-1] == ';':
                        leg = leg[:-1]
                    bill.add_sponsor('primary', leg)

            # cosponsors
            co_td = doc.xpath('//th[text()="Co:"]/following-sibling::td')
            co_text = co_td[0].text_content().replace(u'\xa0', ' ').split('; ')
            for leg in co_text:
                leg = leg.strip()
                if leg and leg != 'N/A':
                    if leg[-1] == ';':
                        leg = leg[:-1]
                    bill.add_sponsor('cosponsor', leg)

            # actions
            action_tr_xpath = '//td[starts-with(text(),"History")]/../../tr'
            # skip two header rows
            for row in doc.xpath(action_tr_xpath)[2:]:
                tds = row.xpath('td')
                act_date = tds[0].text
                actor = tds[1].text or ''
                action = tds[2].text.strip()

                act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

                if actor == 'Senate':
                    actor = 'upper'
                elif actor == 'House':
                    actor = 'lower'
                else:
                    actor = 'executive'

                # classify by prefix; for/else falls through to 'other'
                # when no classifier pattern matches
                for pattern, atype in self._action_classifiers.iteritems():
                    if action.startswith(pattern):
                        break
                else:
                    atype = 'other'

                bill.add_action(actor, action, act_date, type=atype)

            if self.is_latest_session(session):
                subj_key = bill_id[0] + ' ' + bill_id.split(' ')[-1]
                bill['subjects'] = self.subject_map[subj_key]

            self.save_bill(bill)
Exemple #34
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one Michigan bill: title, sponsors, subjects, actions
        (with roll-call votes), versions and documents.

        :param chamber: 'upper' or 'lower'
        :param session: e.g. '2011-2012'; both years are tried in the URL
        :param bill_id: e.g. 'HB 4001'
        :returns: True on success, None if the bill page doesn't exist
        """
        # try and get bill for current year
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.urlopen(url)
        # if first page isn't found, try second year
        if 'Page Not Found' in html:
            html = self.urlopen('http://legislature.mi.gov/doc.aspx?%s-%s' %
                                (session[-4:], bill_id.replace(' ', '-')))
            if 'Page Not Found' in html:
                return None

        doc = lxml.html.fromstring(html)

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(session=session,
                    chamber=chamber,
                    bill_id=bill_id,
                    title=title,
                    type=bill_type)
        bill.add_source(url)

        # sponsors: first listed is primary, the rest are cosponsors
        sp_type = 'primary'
        for sponsor in doc.xpath(
                '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ')
            bill.add_sponsor(sp_type, sponsor)
            sp_type = 'cosponsor'

        bill['subjects'] = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            type = categorize_action(action)
            bill.add_action(actor, action, date, type=type)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    vote = Vote(actor, date, action, False, 0, 0, 0)
                    self.parse_roll_call(vote, vote_url, rc_num)

                    # check the expected counts vs actual
                    count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(vote['yes_votes']):
                        self.warning(
                            'vote count mismatch for %s %s, %d != %d' %
                            (bill_id, action, count, len(vote['yes_votes'])))
                    count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(vote['no_votes']):
                        self.warning(
                            'vote count mismatch for %s %s, %d != %d' %
                            (bill_id, action, count, len(vote['no_votes'])))

                    vote['yes_count'] = len(vote['yes_votes'])
                    vote['no_count'] = len(vote['no_votes'])
                    vote['other_count'] = len(vote['other_votes'])
                    vote['passed'] = vote['yes_count'] > vote['no_count']
                    vote.add_source(vote_url)
                    bill.add_vote(vote)
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            version = self.parse_doc_row(row)
            if version:
                # default to None rather than leaving mimetype unbound
                # (previously a NameError, or a stale value from an
                # earlier iteration) for unexpected extensions
                if version[1].endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif version[1].endswith('.htm'):
                    mimetype = 'text/html'
                else:
                    mimetype = None
                bill.add_version(*version, mimetype=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                bill.add_document(*document)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                bill.add_document(*document)

        self.save_bill(bill)
        return True
Exemple #35
0
    def scrape(self, chamber, session):
        """Scrape Massachusetts bills by iterating sequential bill numbers
        until 10 consecutive ids are missing.

        :param chamber: 'upper' or 'lower'
        :param session: session identifier (e.g. '187th'); the trailing two
            characters are stripped for the URL slug
        """
        # for the chamber of the action
        chamber_map = {'House': 'lower', 'Senate':'upper', 'Joint': 'joint'}

        session_slug = session[:-2]
        chamber_slug = 'House' if chamber == 'lower' else 'Senate'

        # keep track of how many we've had to skip
        skipped = 0

        for n in itertools.count(1):
            bill_id = '%s%05d' % (chamber_slug[0], n)
            bill_url = 'http://www.malegislature.gov/Bills/%s/%s/%s' % (
                session_slug, chamber_slug, bill_id)

            with self.urlopen(bill_url) as html:
                # sometimes the site breaks
                if '</html>' not in html:
                    self.warning('truncated page on %s' % bill_url)
                    continue

                # lets assume if 10 bills are missing we're done
                if skipped == 10:
                    break

                if 'Unable to find the Bill requested' in html:
                    skipped += 1
                    # no such bill
                    continue
                else:
                    skipped = 0


                doc = lxml.html.fromstring(html)
                doc.make_links_absolute('http://www.malegislature.gov/')

                title = doc.xpath('//h2/text()')[0]
                desc = doc.xpath('//p[@class="billShortDesc"]/text()')[0]

                # create bill
                bill = Bill(session, chamber, bill_id, title, description=desc)
                bill.add_source(bill_url)

                # actions
                # A blank branch cell means "same branch as the previous
                # row"; seed actor with the bill's own chamber so the
                # first row can't raise a NameError if its cell is empty.
                actor = chamber
                for act_row in doc.xpath('//tbody[@class="bgwht"]/tr'):
                    date = act_row.xpath('./td[@headers="bDate"]/text()')[0]
                    date = datetime.strptime(date, "%m/%d/%Y")
                    actor_txt = act_row.xpath('./td[@headers="bBranch"]')[0].text_content().strip()
                    if actor_txt:
                        actor = chamber_map[actor_txt]
                    action = act_row.xpath('./td[@headers="bAction"]/text()')[0].strip()
                    atype = classify_action(action)
                    bill.add_action(actor, action, date, type=atype)

                # I tried to, as I was finding the sponsors, detect whether a
                # sponsor was already known. One has to do this because an author
                # is listed in the "Sponsors:" section and then the same person
                # will be listed with others in the "Petitioners:" section. We are
                # guessing that "Sponsors" are authors and "Petitioners" are
                # co-authors. Does this make sense?

                sponsors = dict((a.get('href'), a.text) for a in
                                doc.xpath('//p[@class="billReferral"]/a'))
                petitioners = dict((a.get('href'), a.text) for a in
                                   doc.xpath('//div[@id="billSummary"]/p[1]/a'))

                # remove sponsors from petitioners
                for k in sponsors:
                    petitioners.pop(k, None)

                for sponsor in sponsors.values():
                    bill.add_sponsor('primary', sponsor)
                for petitioner in petitioners.values():
                    bill.add_sponsor('cosponsor', petitioner)

                # sometimes version link is just missing
                bill_text_url = doc.xpath('//a[@title="Show and Print Bill Text"]/@href')
                if bill_text_url:
                    bill.add_version('Current Text', bill_text_url[0])

                self.save_bill(bill)
Exemple #36
0
    def scrape(self, chamber, session):
        """Scrape North Dakota bills for one chamber from the assembly's
        bill-text listing page, then attach details, actions, versions and
        sponsorship data.

        :param chamber: 'upper' or 'lower'
        :param session: assembly/session identifier used in URLs
        """
        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            norm_chamber_name = 'Senate'
        else:
            url_chamber_name = 'house'
            norm_chamber_name = 'House'

        assembly_url = '/assembly/%s' % session

        chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)

        list_url = self.site_root + assembly_url + chamber_url

        # Parsing
        with self.urlopen(list_url) as data:
            soup = self.parser.parse(data)

            if not soup:
                raise ScrapeError('Failed to parse legaslative list page.')

            table = soup.find('table', summary=norm_chamber_name + ' Bills')

            bill_links = table.findAll('a', href=re.compile('bill-actions'))
            indexed_bills = {}

            # ND bill prefixes are coded numerically in the first digit
            # of the 4-digit bill number.
            prefix_map = {
                '1': 'HB', '2': 'SB',
                '3': 'HCR', '4': 'SCR',
                '5': 'HR', '6': 'SR',
                '7': 'HMR', '8': 'SMR',
            }

            for link in bill_links:
                # Populate base attributes
                attributes = {
                    'session': session,
                    'chamber': chamber,
                }

                bill_number = link.contents[0]

                if not re.match('^[0-9]{4}$', bill_number):
                    raise ScrapeError('Bill number not in expected format.')

                # Fail loudly for an unmapped prefix code instead of
                # hitting a NameError (the regex allows leading 0/9).
                try:
                    bill_prefix = prefix_map[bill_number[0]]
                except KeyError:
                    raise ScrapeError('Unknown bill prefix code: %s' %
                                      bill_number)

                attributes['bill_id'] = bill_prefix + ' ' + bill_number

                # Skip duplicates (bill is listed once for each version)
                if attributes['bill_id'] in indexed_bills.keys():
                    continue

                self.debug(attributes['bill_id'])

                # Parse details page
                attributes.update(
                    self.scrape_bill_details(assembly_url, bill_number))

                # Create bill
                bill = Bill(**attributes)

                # Parse actions
                (actions, actions_url) = self.scrape_bill_actions(
                    assembly_url, bill_number, session)
                bill.add_source(actions_url)

                for action in actions:
                    bill.add_action(**action)

                # Parse versions
                (versions, versions_url) = self.scrape_bill_versions(
                    assembly_url, bill_number)
                bill.add_source(versions_url)

                for version in versions:
                    bill.add_version(**version)

                # Add bill to dictionary, indexed by its id
                indexed_bills[attributes['bill_id']] = bill

            # Parse sponsorship data

            (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url)

            for bill_id, sponsor_list in sponsors.items():
                for sponsor in sponsor_list:
                    # Its possible a bill was misnamed somewhere... but thats
                    # not a good enough reason to error out
                    if bill_id in indexed_bills.keys():
                        bill = indexed_bills[bill_id]
                        bill.add_sponsor(**sponsor)
                        bill.add_source(sponsors_url)

            # Save bill
            for bill in indexed_bills.values():
                self.save_bill(bill)
Exemple #37
0
    def _scrape_bill(self, session, bill_data):
        """Build a Bill object from one NY Open Legislation API record.

        :param session: session identifier
        :param bill_data: decoded JSON record from the API
        :returns: the populated Bill (caller is responsible for saving)
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         (prefix, number, active_version)) = details

        bill = Bill(session,
                    bill_chamber,
                    bill_id,
                    title,
                    type=bill_type,
                    summary=bill_data['summary'])

        # Fall back to the summary when the API provides no title.
        if bill_data['title'] is None:
            bill['title'] = bill_data['summary']

        bill_active_version = bill_data['amendments']['items'][active_version]

        # Parse sponsors.
        if bill_data['sponsor']['rules']:
            bill.add_sponsor('primary',
                             'Rules Committee',
                             chamber=bill_chamber)
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsor('primary', primary_sponsor['shortName'])

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsor('cosponsor', cosponsor['shortName'])

        # List companion bill.
        same_as = bill_active_version.get('sameAs', {})
        # Check whether "sameAs" is populated with at least one bill.
        # Use .get(): the {} default above has no 'items' key, so
        # subscripting would raise KeyError exactly when it was used.
        if same_as.get('items'):
            # Get companion bill ID.
            companion_bill_id = same_as['items'][0]['basePrintNo']

            # Build companion bill session.
            start_year = same_as['items'][0]['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Determine companion bill chamber.
            companion_bill_prefix = self._parse_bill_number(
                same_as['items'][0]['basePrintNo'])[0]
            companion_bill_chamber = self._parse_bill_prefix(
                companion_bill_prefix)[0]

            # Attach companion bill data.
            bill.add_companion(
                companion_bill_id,
                companion_bill_session,
                companion_bill_chamber,
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_datetime = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d')
            action_date = action_datetime.date()
            types, attrs = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(chamber,
                            action['text'],
                            action_date,
                            type=types,
                            **attrs)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                vote = self._parse_senate_votes(vote_data)
                bill.add_vote(vote)
        elif bill_chamber == 'lower':
            # Building the Assembly page mutates `bill` in place.
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        amendments = bill_data['amendments']['items']
        for key, amendment in amendments.iteritems():
            version = amendment['printNo']

            html_version = version + ' HTML'
            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version(html_version,
                             html_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

            pdf_version = version + ' PDF'
            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version(pdf_version,
                             pdf_url,
                             on_duplicate='use_new',
                             mimetype='application/pdf')

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        bill.add_source(self.api_client.root + self.api_client.\
            resources['bill'].format(
                session_year=session,
                bill_id=bill_id,
                summary='',
                detail=''))
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        return bill
Exemple #38
0
    def scrape(self, session, chambers):
        HTML_TAGS_RE = r'<.*?>'

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            # Create the bill using its basic information
            bill = Bill(session=session,
                        bill_id=info['BillNumber'],
                        title=info['Title'],
                        chamber=bill_chamber,
                        type=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[ :5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a')
            for version in versions:
                bill.add_version(name=version.xpath('text()')[0],
                                 url=version.xpath('@href')[0].replace(
                                     ' ', '%20'),
                                 mimetype='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                        year_slug), lxml.etree.tostring(doc)).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".\
                        format(info['BillNumber']))
                self.save_bill(bill)
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    assert chambers_passed == set("HS")
                    action_type = 'governor:signed'
                elif actor == 'lower' and \
                        any(x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("H")
                elif actor == 'upper' and \
                        any(x.lower().startswith(' aspassed') or x.lower().startswith('aspassed') for x in action['keywords'].split(';')):
                    action_type = 'bill:passed'
                    chambers_passed.add("S")
                else:
                    action_type = 'other'

                bill.add_action(actor=actor,
                                action=re.sub(HTML_TAGS_RE, "",
                                              action['FullStatus']),
                                date=datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                type=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = Vote(chamber=('lower' if vote['ChamberCode']
                                            == 'H' else 'upper'),
                                   date=datetime.datetime.strptime(
                                       vote['StatusDate'], '%m/%d/%Y'),
                                   motion=re.sub(HTML_TAGS_RE, "",
                                                 vote['FullStatus']).strip(),
                                   passed=did_pass,
                                   yes_count=yea_count,
                                   no_count=nay_count,
                                   other_count=len(roll_call_other))
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                try:
                    vote_to_add.validate()
                except ValueError as e:
                    self.warning(e)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Exemple #39
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id, page):
        """Parse a bill detail page and save the resulting Bill.

        Extracts the sponsor list, bill summary, action history and (when
        present) a vote-history URL from the <pre> section of the page.

        bill_detail_url -- URL of the detail page (recorded as a source)
        session, chamber, bill_id -- identifiers for the Bill being built
        page -- raw HTML of the detail page as a string

        Raises ScrapeError when the page cannot be parsed at all; logs a
        warning and returns None when the <pre> section is missing.
        """
        data = page

        # Everything we care about follows the closing </FORM> tag.
        results = re.search(r'</FORM>', data)
        if not results:
            raise ScrapeError("scrape_details(1) - unable to parse |%s|" %
                              bill_detail_url)

        pre_start = page.find("<pre>", results.start())
        if pre_start == -1:
            self.warning(
                "scrape_details(2) - unable to parse (no <pre>) |%s|\n|%s|" %
                (bill_detail_url, page))
            return

        pre_stop = page.find("</pre>", pre_start)
        if pre_stop == -1:
            raise ScrapeError(
                "scrape_details(3) - unable to parse (no </pre>) %s" %
                bill_detail_url)

        # Restrict further parsing to the <pre> section.
        data = page[pre_start:pre_stop]
        vurl = None

        # Action lines look like "01/02/13  House  Introduced and read ..."
        action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)')

        # Sponsor names appear between " By " and the next </b>.
        results = re.search(r' By ', data)
        if results is not None:
            data = data[results.end():]

        results1 = re.search(r'</b>', data)

        newspon = []
        if results is not None and results1 is not None:
            spondata = data[:results1.start()]
            newspon.extend(sponsorsToList(spondata))
            data = data[results1.end():]

        bill_number = re.search(r"(\d+)", bill_id).group(0)

        (similar_bills, summary, after_summary,
         vurl) = self.split_page_into_parts(data, session, bill_number)

        bill_summary = summary.strip().decode('utf8', 'ignore')

        bill = Bill(session,
                    chamber,
                    bill_id,
                    bill_summary,
                    type=bill_type(bill_summary))

        # Parse one action per line of the post-summary text.
        linenum = 0
        for line in after_summary.splitlines():
            # Drop any trailing parenthesized annotation.
            action_line = line.partition("(")[0].strip()
            r = action_line_re.search(action_line)

            if r:
                the_date = r.group(1)
                action_chamber = r.group(2)
                action = r.group(3)

                date = datetime.datetime.strptime(the_date, "%m/%d/%y")
                date = date.date()

                t = action_type(action)
                if t == ['other']:
                    self.debug("OTHERACTION: bill %s %d Text[%s] line[%s]" %
                               (bill_id, linenum, action, line))
                else:
                    self.debug("ACTION: %s %d dt|ch|action [%s|%s|%s] [%s]" %
                               (bill_id, linenum, the_date, action_chamber,
                                action, str(t)))

                bill.add_action(chamber, action, date, t)
            elif len(line) > 0:
                self.debug("Skipping line %d [%s] line:[%s]" %
                           (linenum, bill_id, line))

            linenum += 1

        if similar_bills:
            bill['similar'] = similar_bills

        bill.add_source(bill_detail_url)

        for sponsor in newspon:
            bill.add_sponsor("sponsor", sponsor)

        if vurl:
            # Vote scraping is best-effort: a failure here should not lose
            # the rest of the bill, so log the traceback and continue.
            try:
                self.scrape_vote_history(vurl, chamber, bill, bill_id)
                bill.add_source(vurl)
                self.debug("Scraped votes: (chamber=%s,bill=%s,url=%s)" %
                           (chamber, bill_id, vurl))
            except Exception:
                self.warning(
                    "Failed to scrape votes: chamber=%s bill=%s vurl=%s %s" %
                    (chamber, bill_id, vurl, traceback.format_exc()))

        self.save_bill(bill)
Exemple #40
0
    def scrape_bill(self, link, chamber, session):
        """Scrape one Delaware bill page; return the populated Bill.

        Collects sponsors, documents/versions, actions and roll-call votes.
        If the bill was substituted, the substitute page is scraped
        recursively and used as the base Bill.  Returns None for 404 pages,
        nominee pages, and pages whose bill number cannot be parsed.
        """
        # Maps the spelled-out legislation type to its abbreviation.
        legislation_types = {
            'House Bill': 'HB',
            'House Concurrent Resolution': 'HCR',
            'House Joint Resolution': 'HJR',
            'House Resolution': 'HR',
            'Senate Bill': 'SB',
            'Senate Concurrent Resolution': 'SCR',
            'Senate Joint Resolution': 'SJR',
            'Senate Resolution': 'SR',
        }

        text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"
        try:
            page = self.lxmlize(link)
        except scrapelib.HTTPError:
            self.logger.warning('404. Apparently the bill hasn\'t been posted')
            return
        nominee = page.xpath(".//div[@id='page_header']/text()")[0]
        if nominee.strip().lower() == "nominee information":
            self.logger.info("Nominee, skipping")
            return

        bill_id = page.xpath(".//div[@align='center']")
        try:
            bill_id = bill_id[0].text_content().strip()
        except IndexError:
            self.logger.warning("Can't find bill number, skipping")
            return

        # some bill_ids include relevant amendments
        # in the form "SB 10 w/SA1", so we fix it here
        bill_id = bill_id.split("w/")[0]
        bill_id = bill_id.split("(")[0]

        # Normalize the bill id to "<abbrev> <number>".
        leg_type = None
        for long_name, short_name in legislation_types.items():
            if long_name in bill_id:
                leg_type = short_name
                bill_num = bill_id.replace(long_name, "").strip()
                break
        if leg_type:
            bill_id = leg_type + " " + bill_num
        elif "for" in bill_id:
            bill_id = bill_id.split("for")[1]
        else:
            self.logger.warning("Unknown bill type for {}".format(bill_id))
            return

        bill_id = bill_id.replace('&nbsp', "")
        bill_id = bill_id.strip()

        # each row is in its own table
        # there are no classes/ids or anything, so we're going to loop
        # through the individual tables and look for keywords
        # in the first td to tell us what we're looking at
        tables = page.xpath('.//div[@id="page_content"]/table')

        bill_documents = {}
        action_list = []
        vote_documents = {}
        sub_link = None
        # Default the sponsor containers so a page that omits one of the
        # sponsor rows cannot raise NameError further down.
        primary_sponsors = []
        addl_sponsors = []
        cosponsors = []

        for table in tables:
            tds = table.xpath('.//td')
            if len(tds) == 0:
                # some kind of empty table for formatting reasons
                continue
            title_text = tds[0].text_content().strip().lower()

            if title_text.startswith('primary sponsor'):
                pri_sponsor_text = tds[1].text_content()
                primary_sponsors = self.separate_names(pri_sponsor_text)
                # sometimes additional sponsors are in a 3rd td
                # other times the 3rd td contains a blank image
                addl_sponsors = []
                add_spons_text = tds[2].text_content().strip()
                if add_spons_text:
                    add_spons_text = add_spons_text.replace(
                        "Additional Sponsor(s):", "")
                    if not "on behalf of all representatives" in add_spons_text.lower(
                    ):
                        addl_sponsors = self.separate_names(add_spons_text)

            elif title_text.startswith('co-sponsor'):
                cosponsor_text = tds[1].text_content()
                if "none..." in cosponsor_text.lower():
                    cosponsors = []
                    continue
                cosponsors = self.separate_names(cosponsor_text)

            elif title_text.startswith('long title'):
                bill_title = tds[1].text_content().strip()

            elif title_text.startswith('amendment'):
                amendments = tds[1].xpath('.//a')
                for a in amendments:
                    amm = a.text
                    self.logger.debug(amm)
                    # BUG FIX: was "Amendment".format(...), which has no
                    # placeholder, so every amendment shared the single key
                    # "Amendment" and overwrote the previous one.
                    amm_text = "Amendment {}".format(amm.strip())
                    amm_slg = "+".join(amm.split())
                    amm_link = text_base_url.format(session=session,
                                                    bill_id=amm_slg)
                    bill_documents[amm_text] = amm_link
                    amm_page = self.lxmlize(a.attrib["href"])
                    for tr in amm_page.xpath('//tr'):
                        amm_tds = tr.xpath("./td")
                        if len(amm_tds) > 1:
                            if "voting" in amm_tds[0].text_content().lower():
                                self.find_vote(amm_tds, vote_documents,
                                               "Amendment: ")

            elif title_text.startswith('engrossed version'):
                if tds[1].text_content().strip():
                    engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                    engrossment_link = engrossment_base.format(
                        session=session, bill_id="+".join(bill_id.split()))
                    # BUG FIX: previously tested `bill_url`, which belongs to
                    # the "full text" branch and may not exist yet (NameError)
                    # when the engrossed row comes first.
                    if engrossment_link not in bill_documents.values():
                        bill_documents["Engrossed Version"] = engrossment_link

            elif title_text.startswith('substituted'):
                content = tds[1].text_content().strip()
                if ("Substitute" in content and not "Original" in content):
                    sub_link = tds[1].xpath(".//a/@href")[0]

            elif ("full text" in title_text
                  and ("(" not in title_text or "html" in title_text)):
                if tds[1].text_content().strip():
                    # it is totally unclear which version of the bill is referred to here
                    # so I'm just calling it "bill text"
                    bill_url = text_base_url.format(session=session,
                                                    bill_id=bill_id.replace(
                                                        " ", "+"))
                    if bill_url not in bill_documents.values():
                        bill_documents["Bill Text"] = bill_url

            elif title_text.startswith('fiscal notes'):
                pass
                # skipping fiscal notes for now, they are really ugly
                # but leaving in as a placeholder so we can remember to
                # do this someday, if we feel like it

            elif title_text.startswith('committee reports'):
                pass
                # the committee reports let a legislator
                # comment on a bill. They can comment as
                # "favorable","unfavorable" or "on its merits"
                # but these are NOT votes (per conversation w
                # seceretary of the DE senate 3/16/15). The bill is
                # considered if the majority sign it, which will
                # appear in the bill's action history as being
                # reported out of committee

            elif title_text.startswith('voting'):
                self.find_vote(tds, vote_documents)

            elif title_text.startswith('actions history'):
                action_list = tds[1].text_content().split("\n")

        sub_versions = []
        use_sub = False
        if sub_link:
            # Recurse into the substitute bill; its versions supersede ours.
            bill = self.scrape_bill(sub_link, chamber, session)
            if bill:
                sub_versions = [v["url"] for v in bill["versions"]]
                bill.add_title(bill_id)
                use_sub = True

        if not use_sub:
            bill = Bill(session, chamber, bill_id, bill_title)

            for s in primary_sponsors:
                bill.add_sponsor("primary", s)

            for s in addl_sponsors:
                # it is not totally clear whether "additional sponsors"
                # are co or primary but primary is my best guess
                # based on the bill text, bc they're on the first
                # line with the primary sponsor
                bill.add_sponsor("primary", s)

            for s in cosponsors:
                bill.add_sponsor("cosponsor", s)

        for name, doc_link in bill_documents.items():
            if "Engrossment" in name or "Bill Text" in name:
                # Versions, unless the substitute already carries this URL.
                if doc_link not in sub_versions:
                    bill.add_version(name, doc_link, mimetype="text/html")
            else:
                bill.add_document(name, doc_link, mimetype="text/html")

        for a in action_list:
            if a.strip():
                date, action = a.split('-', 1)
                try:
                    date = datetime.strptime(date.strip(), '%b %d, %Y')
                except ValueError:
                    date = datetime.strptime(date.strip(),
                                             '%B %d, %Y')  # XXX: ugh.
                action = action.strip()
                actor = actions.get_actor(action, bill['chamber'])
                attrs = dict(actor=actor, action=action, date=date)
                attrs.update(**self.categorizer.categorize(action))
                attrs["action"] = " ".join(attrs["action"].split())
                bill.add_action(**attrs)

        for name, doc in vote_documents.items():
            vote_chamber = "lower" if "house" in name.lower() else "upper"
            try:
                self.head(doc)
            except scrapelib.HTTPError:
                self.logger.warning("could not access vote document")
                continue
            vote_page = self.lxmlize(doc)
            vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1]
            yes_votes = []
            no_votes = []
            other_votes = []
            # Assume a non-voice vote until the "Vote Type" line says
            # otherwise; previously this was unset until that line was seen,
            # risking NameError on malformed pages.
            voice_vote = False
            lines = vote_info.text_content().split("\n")
            for line in lines:
                if line.strip().startswith("Date"):
                    date_str = " ".join(line.split()[1:4])
                    date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                    passage_status = line.strip().split()[-1]

                    # we've never seen a vote with anything but "passed"
                    # so throw an error otherwise so we can figure it out
                    passed_statuses = ["Passed"]
                    failed_statuses = ["Defeated", "Rescinded"]
                    if passage_status not in passed_statuses + failed_statuses:
                        raise AssertionError(
                            "Unknown passage state {}".format(passage_status))
                    passed = passage_status in passed_statuses

                if line.strip().startswith("Vote Type"):
                    if "voice" in line.lower():
                        voice_vote = True
                    else:
                        voice_vote = False
                        yes_count = int(re.findall(r"Yes: (\d+)", line)[0])
                        no_count = int(re.findall(r"No: (\d+)", line)[0])
                        other_count = int(
                            re.findall(r"Not Voting: (\d+)", line)[0])
                        other_count += int(
                            re.findall(r"Absent: (\d+)", line)[0])
                        # The roll-call table alternates name cells and
                        # vote-code cells; track that with a toggle.
                        vote_tds = vote_page.xpath(".//table//td")
                        person_seen = False
                        for td in vote_tds:
                            if person_seen:
                                person_vote = td.text_content().strip()
                                if person_vote == "Y":
                                    yes_votes.append(person)
                                elif person_vote == "N":
                                    no_votes.append(person)
                                elif person_vote in ["NV", "A", "X", "C"]:
                                    other_votes.append(person)
                                else:
                                    raise AssertionError(
                                        "Unknown vote '{}'".format(
                                            person_vote))
                                person_seen = False
                            else:
                                person = td.text_content().strip()
                                if person:
                                    person_seen = True

            if voice_vote:
                vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0)
            else:
                vote = Vote(vote_chamber,
                            date,
                            "passage",
                            passed,
                            yes_count,
                            no_count,
                            other_count,
                            yes_votes=[],
                            no_votes=[],
                            other_votes=[])

                vote["yes_votes"] = yes_votes
                vote["no_votes"] = no_votes
                vote["other_votes"] = other_votes

            # Sanity-check the outcome against the tallies.
            if (passed and vote["yes_count"] <= vote["no_count"]
                    and not voice_vote):
                raise AssertionError("Vote passed with more N than Y votes?")

            if not passed and vote["yes_count"] > vote["no_count"]:
                self.logger.warning("Vote did not pass but had a majority \
                        probably worth checking")

            if "Amendment" in name:
                vote["type"] = "amendment"
            else:
                vote["type"] = "passage"
            vote.add_source(doc)
            bill.add_vote(vote)

        bill.add_source(link)

        return bill
Exemple #41
0
    def parse_bill_xml(self, chamber, session, txt):
        """Build a Bill from a bill XML document and return it.

        chamber -- 'upper' or 'lower'
        session -- session id; a trailing 'R' (regular session) is stripped
                   to the bare two-character session number
        txt -- raw XML for a single bill

        Populates actions (with categorized types), the four sponsor roles
        and the subject list.  Raises ScrapeError for unrecognized bill ids.
        """
        root = lxml.etree.fromstring(txt)
        # The 'bill' attribute looks like "<session> <id...>"; keep the id.
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        # e.g. "82R" -> "82": drop the regular-session suffix.
        if session[2] == 'R':
            session = session[0:2]

        # Classify by the letters after the chamber prefix (HB, SR, HCR, SJR).
        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            # The first character of the action number encodes the actor.
            actor = {
                'H': 'lower',
                'S': 'upper',
                'E': 'executive'
            }[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            # Placeholder row for hearings that were never actually scheduled.
            if desc == 'Scheduled for public hearing on . . .':
                continue

            # Map the free-text description to OpenStates action types.
            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'other'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
            elif desc == 'Read & adopted':
                atype = 'bill:passed'
            elif desc.startswith('Referred to') or desc.startswith(
                    "Recommended to be sent to "):
                atype = 'committee:referred'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee:passed'
            elif desc == "Filed":
                atype = 'bill:filed'
            else:
                atype = 'other'

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        # Each sponsor field is a pipe-delimited list (may be empty).
        for role in ('author', 'coauthor', 'sponsor', 'cosponsor'):
            for name in root.findtext(role + 's').split(' | '):
                if name != "":
                    bill.add_sponsor(role, name)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
Exemple #42
0
    def scrape_actions(self, session, href):
        """Scrape a "Bill Actions" page, building and saving the Bill.

        session -- legislative session identifier
        href -- URL of the bill's actions page

        Extracts the bill id/title/sponsors from the page header, one action
        per table row (inheriting the chamber from the previous row when a
        row leaves it blank), and follows the "Versions" link if present.
        """
        page = self.lxmlize(href)

        (bid, ) = page.xpath('//h1[@id="page-title"]/text()')
        bid = re.sub(r"^Bill Actions for ", "", bid)
        subjects = self.subjects.get(bid, [])

        # some pages say "Measure Number Breakdown", others "Bill..."
        table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
        table = table[0]
        ttrows = page.xpath("//div[@id='application']/p")
        descr = ttrows[-2]

        title = re.sub(r"\s+", " ", descr.text_content()).strip()
        ttrows = ttrows[:-1]

        # The first letter of the bill id encodes the chamber.
        chamber = {"H": "lower", "S": "upper"}[bid[0]]

        # Characters 2-3 encode the measure type (B, R or CR).
        type_ = bid[1:3]
        if type_ == "CR":
            bill_type = "concurrent resolution"
        elif type_.startswith("R"):
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(session,
                    chamber,
                    bid,
                    title,
                    subjects=subjects,
                    type=bill_type)

        bill.add_source(href)

        # Sponsor paragraphs read "Introduced by Rep./Sen. A, B, C".
        for row in ttrows:
            if isinstance(row, lxml.html.HtmlComment):
                continue  # ignore HTML comments, no text_content()
            sponsors = row.text_content().strip()
            sinf = re.match(
                r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)",
                sponsors)
            if sinf:
                sponsors = sinf.groupdict()
                for sponsor in [
                        x.strip() for x in sponsors['sponsors'].split(",")
                ]:
                    bill.add_sponsor('primary', sponsor)

        dt = None
        oldchamber = 'other'
        for row in table.xpath(".//tr"):
            if row.text_content().strip() == '':
                continue

            # Skip the header row.
            if "Meeting Description" in [
                    x.strip() for x in row.xpath(".//th/text()")
            ]:
                continue

            row = [x.text_content().strip() for x in row.xpath("./*")]
            if len(row) > 3:
                row = row[:3]

            date, chamber, action = row

            try:
                chamber = {"House": "lower", "Senate": "upper"}[chamber]
                oldchamber = chamber
            except KeyError:
                # Blank chamber cell: the action stays with the prior chamber.
                chamber = oldchamber

            if date != '':
                # Page dates omit the year; append the scraper's year.
                dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

            kwargs = self.categorizer.categorize(action)

            bill.add_action(chamber, action, dt, **kwargs)

        # Follow the "Versions" link, if present, to collect bill texts.
        version_url = page.xpath("//a[contains(text(), 'Versions')]")
        if len(version_url) == 1:
            href = version_url[0].attrib['href']
            bill = self.scrape_versions(bill, href)

        self.save_bill(bill)
Exemple #43
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one Iowa bill: title, versions, sponsors, actions, subjects.

        ``url`` is the bill's sidebar page; the history data actually lives
        at a popup URL extracted from the "Bill History" link.

        Side effects: saves the completed Bill via ``self.save_bill``.
        """
        sidebar = lxml.html.fromstring(self.urlopen(url))

        try:
            hist_url = get_popup_url(
                sidebar.xpath("//a[contains(., 'Bill History')]")[0])
        except IndexError:
            # No "Bill History" link -- nothing we can scrape for this bill.
            return

        page = lxml.html.fromstring(self.urlopen(hist_url))
        page.make_links_absolute(hist_url)

        title = page.xpath("string(//table[2]/tr[4])").strip()
        if title == '':
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

        # On some pages the title row is shifted up by one; a row starting
        # with "in" means we grabbed the wrong cell.
        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        # Classify by the id prefix. Order matters: plain 'HR'/'SR' also
        # appears inside 'HJR'/'HCR' etc., but those contain 'HR'/'SR' too,
        # so the simple-resolution check must come first.
        if 'HR' in bill_id or 'SR' in bill_id:
            bill_type = ['resolution']
        elif 'HJR' in bill_id or 'SJR' in bill_id:
            bill_type = ['joint resolution']
        elif 'HCR' in bill_id or 'SCR' in bill_id:
            bill_type = ['concurrent resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(hist_url)

        # Versions: rebuild each version URL from the pieces of the sample
        # HTML link, substituting each <option> of the BVer selector.
        vpieces = sidebar.xpath('//a[contains(string(.), "HTML")]/@href')
        if vpieces:
            version_base, version_type, version_end = vpieces[0].rsplit('/', 2)
            versions = [
                o.strip()
                for o in sidebar.xpath("//select[@name='BVer']/option/text()")
            ]
            # If there are no options, fall back to the single linked version.
            if not versions:
                versions = [version_type]

            for version_name in versions:
                version_url = '/'.join(
                    (version_base, version_name, version_end))
                bill.add_version(version_name,
                                 version_url,
                                 mimetype='text/html')
        else:
            # No HTML versions at all; use the PDF link instead.
            bill.add_version(
                'Introduced',
                sidebar.xpath('//a[contains(string(.), "PDF")]/@href')[0],
                mimetype='application/pdf')

        sponsors = page.xpath("string(//table[2]/tr[3])").strip()
        sponsor_re = r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)'
        for sponsor in re.findall(sponsor_re, sponsors):
            sponsor = sponsor.replace(' and', '').strip(' .,')

            # a few sponsors get mangled by our regex
            sponsor = {
                'Means': 'Ways & Means',
                'Iowa': 'Economic Growth/Rebuild Iowa',
                'Safety': 'Public Safety',
                'Resources': 'Human Resources',
                'Affairs': 'Veterans Affairs',
                'Protection': 'Environmental Protection',
                'Government': 'State Government',
                'Boef': 'De Boef'
            }.get(sponsor, sponsor)

            bill.add_sponsor('primary', sponsor)

        # Known version URLs, so amendment links aren't added twice.
        # Hoisted out of the row loop (it was rebuilt every iteration);
        # kept in sync with every add_version below.
        version_urls = set(version['url'] for version in bill['versions'])

        for tr in page.xpath("//table[3]/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[2])").strip()
            action = re.sub(r'\s+', ' ', action)

            # Capture any amendment links.
            if 'amendment' in action.lower():
                for anchor in tr.xpath('td[2]/a'):
                    if '-' in anchor.text:
                        url = anchor.attrib['href']
                        if url not in version_urls:
                            bill.add_version(anchor.text,
                                             url,
                                             mimetype='text/html')
                            version_urls.add(url)

            # Journal citations identify the acting chamber.
            if 'S.J.' in action or 'SCS' in action:
                actor = 'upper'
            elif 'H.J.' in action or 'HCS' in action:
                actor = 'lower'
            else:
                actor = "other"

            action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()

            if action.startswith('Introduced'):
                atype = ['bill:introduced']
                if ', referred to' in action:
                    atype.append('committee:referred')
            elif action.startswith('Read first time'):
                atype = 'bill:reading:1'
            elif action.startswith('Referred to'):
                atype = 'committee:referred'
            elif action.startswith('Sent to Governor'):
                atype = 'governor:received'
            elif action.startswith('Reported Signed by Governor'):
                atype = 'governor:signed'
            elif action.startswith('Signed by Governor'):
                atype = 'governor:signed'
            elif action.startswith('Vetoed by Governor'):
                atype = 'governor:vetoed'
            elif action.startswith('Item veto'):
                atype = 'governor:vetoed:line-item'
            elif re.match(r'Passed (House|Senate)', action):
                atype = 'bill:passed'
            elif re.match(r'Amendment (S|H)-\d+ filed', action):
                atype = ['amendment:introduced']
                if ', adopted' in action:
                    atype.append('amendment:passed')
            elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted',
                          action):
                atype = 'amendment:passed'
            # BUG FIX: was r'Amendment (S|N)-\d+ lost' -- 'N' is not a valid
            # chamber prefix (siblings above use S|H), so House amendment
            # losses were never categorized.
            elif re.match(r'Amendment (S|H)-\d+ lost', action):
                atype = 'amendment:failed'
            elif action.startswith('Resolution filed'):
                atype = 'bill:introduced'
            elif action.startswith('Resolution adopted'):
                atype = 'bill:passed'
            elif (action.startswith('Committee report')
                  and action.endswith('passage.')):
                atype = 'committee:passed'
            elif action.startswith('Withdrawn'):
                atype = 'bill:withdrawn'
            else:
                atype = 'other'

            if action.strip() == "":
                continue

            bill.add_action(actor, action, date, type=atype)

        bill['subjects'] = self._subjects[bill_id]
        self.save_bill(bill)
Exemple #44
0
    def scrape_bill(self,
                    url,
                    kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_get_actor=actions.get_actor):
        """Scrape a single Delaware bill page.

        Args:
            url: URL of the bill detail page.
            kw: keyword arguments forwarded verbatim to the Bill
                constructor (session, chamber, bill_id, title, ...).
            re_amendment, re_substitution, re_digits, actions_get_actor:
                precompiled regexes and a helper bound as defaults so they
                are evaluated once at definition time.

        Side effects: saves the completed Bill via ``self.save_bill``.
        NOTE(review): uses dict.iteritems(), so this is Python 2 code.
        """

        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # A few helpers.
        _url_2_lxml = self._url_2_lxml
        _cleanup_sponsors = self._cleanup_sponsors

        # Shortcut function partial to get text at a particular xpath:
        doc = _url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
        session_num = doc.xpath(xpath)[0].text_content()
        # Leading digits of e.g. "147th General Assembly".
        session_num = re_digits.match(session_num).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        # Maps the page's sponsor headings to openstates sponsor types.
        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'
        }

        xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(xpath + '/text()')
        sponsors = doc.xpath(xpath +
                             '/../../following-sibling::td/font/text()')

        # Each heading is paired positionally with its sponsor-name cell.
        for h, s in zip(headings, sponsors):

            names = _cleanup_sponsors(s, chamber)
            type_ = sponsor_types[h.strip()]

            if names:
                for name, _chamber in names:
                    bill.add_sponsor(type_, name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions

        # URL template filled in by scrape_documents ({session_num},
        # {moniker}, {format_}).
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/(unknown){format_}?open'
        ])

        documents = self.scrape_documents(source=url,
                                          docname="introduced",
                                          filename="Legis",
                                          tmp=tmp,
                                          session_num=session_num)

        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        names = doc.xpath('//*[contains(text(), "Substituted '
                          'Legislation for Bill:")]/text()')
        urls = doc.xpath('//*[contains(text(), "Substituted '
                         'Legislation for Bill:")]'
                         '/following-sibling::a/@href')

        for name, url in zip(names, urls):

            name = re_substitution.match(name).group(1)
            bill.add_version(name, url, description='original bill')

        #---------------------------------------------------------------------
        # Actions
        # NOTE: this local `actions` shadows the module-level `actions`
        # object; the actions_get_actor default already captured the helper.
        actions = doc.xpath('//font[contains(., "Actions History")]'
                            '/../following-sibling::table/descendant::td[2]')
        actions = actions[0].text_content()
        # Python 2 filter() returns a list here, so reversed() below is fine.
        actions = filter(None, actions.splitlines())

        # Entries are listed newest-first; add oldest-first.
        for a in reversed(actions):
            date, action = a.split(' - ', 1)
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

            actor = actions_get_actor(action, bill['chamber'])
            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        #---------------------------------------------------------------------
        # Votes
        # Two layouts exist for vote links; try each xpath until one matches.
        xpaths = [
            '//*[contains(text(), "vote:")]/following-sibling::a/@href',
            '//font[contains(., "vote:")]/a/@href'
        ]
        for xpath in xpaths:
            vote_urls = doc.xpath(xpath)
            if vote_urls:
                break

        for url in vote_urls:
            vote = self.scrape_vote(url)
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a")

        tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
               'vwLegislation/{id_}/$file/(unknown){format_}?open')

        for source, id_ in zip(doc.xpath(xpath + '/@href'),
                               doc.xpath(xpath + '/text()')):

            match = re_amendment.match(id_)
            if match is None:
                # Fallback: anchored pattern missed; search anywhere.
                match = re.search('/?([A-Z]A \\d{1,3}) to', id_)
            short_id = match.group(1)

            documents = self.scrape_documents(source=source,
                                              docname='amendment (%s)' %
                                              short_id,
                                              filename='Legis',
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=id_)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/(unknown){format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Engrossment",
                                              filename="Engross",
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=bill['bill_id'])

            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/(unknown){format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Fiscal Note",
                                              filename="Fiscal",
                                              tmp=tmp,
                                              session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # Helper to get the first td sibling of certain nodes.
        tmp = '//font[contains(., "%s")]/../../../td[2]'
        first_sibling_text = lambda heading: _get_text(tmp % heading)

        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, name in extra_fields.iteritems():
            try:
                bill[key] = first_sibling_text(name)
            except IndexError:
                # xpath lookup failed.
                pass

        # NOTE(review): with `and`, ANY empty-titled bill in session 147
        # (or "HB 130" in any session) skips the raise -- `or` may have
        # been intended. Confirm before changing.
        if bill['title'].strip() == "":
            if bill['bill_id'] != "HB 130" and bill['session'] != '147':
                raise Exception("bill title is empty")
            bill['title'] = bill['summary']
            # This added to help hack around the page that's missing
            # the bill title.

        self.save_bill(bill)
Exemple #45
0
    def bill_info(self, bill_link, session, main_url, bill_page):
        """Parse a Nebraska bill detail page and persist the resulting Bill.

        ``bill_page`` is the raw HTML of the detail page; ``bill_link`` and
        ``main_url`` are recorded as sources. Returns None when the page
        has no usable heading.
        """
        page = lxml.html.fromstring(bill_page)

        # Heading text is "<bill_id> <separator> <title words...>".
        headings = page.xpath('//div[@id="content_text"]/h2')
        if not headings:
            return None
        words = headings[0].text.split()

        bill_id = words[0]
        title = ' '.join(words[2:])

        if not title:
            self.error('no title, skipping %s', bill_id)
            return

        kind = 'resolution' if 'LR' in bill_id else 'bill'
        bill = Bill(session, 'upper', bill_id, title, type=kind)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        # The introducer is the first link in the overview table.
        introducer = page.xpath(
            '//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
        bill.add_sponsor('primary', introducer)

        # Action rows; the header row (containing "Date") is skipped.
        for row in page.xpath(
                '//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
            when = row[0].text
            if 'Date' in when:
                continue
            when = datetime.strptime(when, '%b %d, %Y')
            action = row[1].text_content()

            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = 'upper'

            bill.add_action(actor, action, when, self.action_types(action))

        # Site lists actions newest-first; store them oldest-first.
        bill['actions'].reverse()

        # Versions (PDFs). Hrefs are relative ("../..."); strip the first
        # three characters and prefix the site root, then map "Current"
        # onto this session's number.
        site_root = 'http://nebraskalegislature.gov/'
        for link in page.xpath(
                '//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
            href = site_root + link.attrib['href'][3:]
            href = href.replace('Current', session)
            bill.add_version(link.text, href, mimetype='application/pdf')

        # A separate "documents" table exists but duplicates the versions,
        # so it is deliberately not scraped.

        # Amendments, attached as documents.
        for link in page.xpath(
                '//div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'
        ):
            bill.add_document(link.text, site_root + link.attrib['href'][3:])

        # Related transcripts (absolute links on the page).
        for link in page.xpath(
                '//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
            bill.add_document(link.text, link.attrib['href'])

        self.save_bill(bill)
Exemple #46
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
        """Scrape one West Virginia bill page: subjects, versions,
        sponsors, votes and actions.

        ``strip_sponsors`` is bound as a default argument so the regex is
        compiled once; it deletes short parenthesized annotations from the
        sponsor strings before parsing.

        Side effects: saves the completed Bill via ``self.save_bill``.
        """

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        # Bill type is keyed on the id prefix minus the leading chamber
        # letter (e.g. "HB 101" -> "B").
        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill['subjects'] = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version(**version)

        # Resolution pages have different html.
        # Collect "HEADING: value" rows from the history table into a dict.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always same as title
        #bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsor('primary', primary)

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            # Split on commas not followed by an initial like "D.".
            sponsors = re.split(', (?![A-Z]\.)', sponsors)
            for name in sponsors:
                name = name.strip(', \n\r')
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search('(.+?), ([DM]\. Hall)', name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsor('cosponsor', name)
                    else:
                        bill.add_sponsor('cosponsor', name)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            self.scrape_house_vote(bill, link.attrib['href'])

        # History rows are newest-first; reverse for chronological order.
        # First row is the header and is skipped via [1:].
        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            # First cell is the chamber letter ('S' or 'H').
            chamber_letter = tds[0].text_content()
            chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    self.scrape_senate_vote(bill, href, date)

            attrs = dict(actor=chamber, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        self.save_bill(bill)
Exemple #47
0
    def scrape(self, session, chambers):
        """Scrape all Indiana bills for ``session`` via the IGA API.

        Bill documents are fetched through Sunlight's proxy
        (http://in.proxy.openstates.org) because the API requires a key in
        the request headers; the proxy supplies it so the PDFs remain
        publicly viewable by our scrapers and users.
        """
        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in.proxy.openstates.org"}

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            # Split "HB1001" into "HB 1001" at the first digit.
            # NOTE(review): if billName had no digits, disp_bill_id would be
            # unbound -- presumably the API never returns such a name.
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            bill_json = client.get("bill", session=session,
                                   bill_id=bill_id.lower())

            title = bill_json["title"]
            if title == "NoneNone":
                title = None
            # Sometimes the title is blank; if so, fall back to the latest
            # version's short description, then to the bill id itself.
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]
            if not title:
                title = bill_id
                self.logger.warning(
                    "Bill is missing a title, using bill id instead.")

            original_chamber = ("lower"
                                if bill_json["originChamber"].lower() == "house"
                                else "upper")
            bill = Bill(session, original_chamber, disp_bill_id, title)

            bill.add_source(self.make_html_source(session, bill_id))
            bill.add_source(api_source)

            # Sponsors: authors/sponsors are primary, co* are cosponsors.
            positions = {"Representative": "lower", "Senator": "upper"}
            for s in bill_json["authors"]:
                bill.add_sponsor("primary",
                                 self.get_name(s),
                                 chamber=positions[s["position_title"]],
                                 official_type="author")

            for s in bill_json["coauthors"]:
                bill.add_sponsor("cosponsor",
                                 self.get_name(s),
                                 chamber=positions[s["position_title"]],
                                 official_type="coauthor")

            for s in bill_json["sponsors"]:
                bill.add_sponsor("primary",
                                 self.get_name(s),
                                 chamber=positions[s["position_title"]],
                                 official_type="sponsor")

            for s in bill_json["cosponsors"]:
                bill.add_sponsor("cosponsor",
                                 self.get_name(s),
                                 chamber=positions[s["position_title"]],
                                 official_type="cosponsor")

            # Actions.
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link
            try:
                actions = client.get("bill_actions", session=session,
                                     bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"

                date = a["date"]
                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue
                date = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("bill:reading:1")
                    reading = True

                if ("second reading" in d
                        or "reread second time" in d):
                    action_type.append("bill:reading:2")
                    reading = True

                if ("third reading" in d
                        or "reread third time" in d):
                    action_type.append("bill:reading:3")
                    if "passed" in d:
                        action_type.append("bill:passed")
                    if "failed" in d:
                        action_type.append("bill:failed")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("bill:passed")

                if ("referred" in d and "committee on" in d
                        or "reassigned" in d and "committee on" in d):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("committee:referred")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee:passed")
                    if "fail" in d:
                        action_type.append("committee:failed")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment:passed")
                    # BUG FIX: was `if "fail" or "out of order" in d`, which
                    # is always truthy and marked every amendment failed.
                    if "fail" in d or "out of order" in d:
                        action_type.append("amendment:failed")
                    if "withdraw" in d:
                        action_type.append("amendment:withdrawn")

                if "signed by the governor" in d:
                    action_type.append("governor:signed")

                # Administrative actions we deliberately classify as "other".
                # BUG FIX: `or "signed by the speaker"` was missing `in d`,
                # which made this entire condition unconditionally true.
                if ("not substituted for majority report" in d
                        or "returned to the house" in d
                        or "referred to the senate" in d
                        or "referred to the house" in d
                        or "technical corrections" in d
                        or "signed by the president" in d
                        or "signed by the speaker" in d
                        or "authored" in d
                        or "sponsor" in d
                        or "coauthor" in d
                        or ("rule" in d and "suspended" in d)
                        or "removed as author" in d
                        or ("added as" in d and "author" in d)
                        or "public law" in d):
                    if len(action_type) == 0:
                        action_type.append("other")

                if len(action_type) == 0:
                    # Calling it other and moving on with a warning.
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(
                            action_desc))
                    action_type = ["other"]

                # BUG FIX: add_action used to hang off an `elif` chained to
                # the empty-action_type branch above, so any action that fell
                # through to "other" was silently never added to the bill.
                if committee:
                    bill.add_action(action_chamber, action_desc, date,
                                    type=action_type, committees=committee)
                else:
                    bill.add_action(action_chamber, action_desc, date,
                                    type=action_type)

            # Subjects come from the latest version.
            subjects = [s["entry"]
                        for s in bill_json["latestVersion"]["subjects"]]
            bill["subjects"] = subjects

            # Versions and votes, oldest first.
            for version in bill_json["versions"][::-1]:
                version_json = client.get(
                    "bill_version",
                    session=session,
                    bill_id=version["billName"],
                    version_id=version["printVersionName"])

                self.deal_with_version(version_json, bill, proxy)

            self.save_bill(bill)
Exemple #48
0
    def bill_info(self, bill_link, session, main_url, bill_page):
        """Parse a Nebraska bill detail page and persist the resulting Bill.

        ``bill_page`` is the raw HTML; links are made absolute against
        ``bill_link``. Returns None when the page has no usable heading.
        """
        page = lxml.html.fromstring(bill_page)
        page.make_links_absolute(bill_link)

        # Heading text is "<bill_id> <separator> <title words...>".
        headings = page.xpath('//div[@id="content_text"]/h2')
        if not headings:
            return None
        words = headings[0].text.split()

        bill_id = words[0]
        title = ' '.join(words[2:])

        if not title:
            self.error('no title, skipping %s', bill_id)
            return

        kind = 'resolution' if 'LR' in bill_id else 'bill'
        bill = Bill(session, 'upper', bill_id, title, type=kind)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        # The introducer is the first link in the overview table.
        introducer = page.xpath(
            '//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
        bill.add_sponsor('primary', introducer)

        # Action rows; the header row (containing "Date") is skipped.
        for row in page.xpath(
                '//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
            when = row[0].text
            if 'Date' in when:
                continue
            when = datetime.strptime(when, '%b %d, %Y')
            action = row[1].text_content()

            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = 'upper'

            bill.add_action(actor, action, when, self.action_types(action))

        # Site lists actions newest-first; store them oldest-first.
        bill['actions'].reverse()

        # Versions (PDFs); "Current" in the path maps to this session.
        for link in page.xpath(
                '//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
            href = link.attrib['href'].replace('Current', session)
            bill.add_version(link.text, href, mimetype='application/pdf')

        # Amendments, attached as documents.
        for link in page.xpath(
                '//h2[text()="Amendments"]/following-sibling::table[1]//a'):
            bill.add_document(link.text, link.attrib['href'])

        # Related transcripts.
        for link in page.xpath(
                '//h2[text()="Related Transcripts"]/following-sibling::table[1]//a'):
            bill.add_document(link.text, link.attrib['href'])

        self.save_bill(bill)
Exemple #49
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
        """Scrape a single bill's detail page: versions, sponsors, house
        votes, and a categorized action history.

        ``strip_sponsors`` is bound at definition time to a compiled
        regex's ``sub`` method; it removes short parenthesized notes
        (e.g. district annotations) from sponsor strings.
        """
        html = self.urlopen(url)

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        # Map the id prefix minus its chamber letter (e.g. 'HB' -> 'B')
        # to an openstates bill type.
        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version(**version)

        # Resolution pages have different html.
        # Collect heading -> value pairs from the bill-history table
        # (e.g. 'LEAD SPONSOR:' -> sponsor name).
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always same as title
        #bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values['LEAD SPONSOR:'])
        if primary:
            bill.add_sponsor('primary', primary)

        # Add cosponsors.
        sponsors = strip_sponsors('', values['SPONSORS:']).split('\r\n')
        for name in sponsors:
            name = name.strip(', ')
            if name:
                bill.add_sponsor('cosponsor', name)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            self.scrape_vote(bill, link.attrib['href'])

        # Walk the action table oldest-first (it is rendered newest-first)
        # so the running `actor` reflects which chamber holds the bill.
        actor = chamber
        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()

            # Track chamber hand-offs before categorizing the action.
            if (action == 'Communicated to Senate'
                    or action.startswith('Senate received')
                    or action.startswith('Ordered to Senate')):
                actor = 'upper'
            elif (action == 'Communicated to House'
                  or action.startswith('House received')
                  or action.startswith('Ordered to House')):
                actor = 'lower'

            # Categorize the action text into an openstates action type;
            # order matters (more specific checks come first).
            if action == 'Read 1st time':
                atype = 'bill:reading:1'
            elif action == 'Read 2nd time':
                atype = 'bill:reading:2'
            elif action == 'Read 3rd time':
                atype = 'bill:reading:3'
            elif action == 'Filed for introduction':
                atype = 'bill:filed'
            elif action.startswith('To Governor') and 'Journal' not in action:
                atype = 'governor:received'
            elif re.match(r'To [A-Z]', action):
                atype = 'committee:referred'
            elif action.startswith('Introduced in'):
                atype = 'bill:introduced'
            elif (action.startswith('Approved by Governor')
                  and 'Journal' not in action):
                atype = 'governor:signed'
            elif (action.startswith('Passed Senate')
                  or action.startswith('Passed House')):
                atype = 'bill:passed'
            elif (action.startswith('Reported do pass')
                  or action.startswith('With amendment, do pass')):
                atype = 'committee:passed'
            else:
                atype = 'other'

            bill.add_action(actor, action, date, type=atype)

        self.save_bill(bill)
Exemple #50
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape a single SD bill page: versions, sponsors, categorized
        actions, roll-call votes linked from actions, and keyword subjects.

        Fixes: regex patterns are now raw strings (the old plain strings
        contained invalid escape sequences such as ``\\?`` and ``\\d``),
        and subject anchors without text no longer raise AttributeError.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Classify the bill from its id prefix; default to 'bill'.
        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=btype)
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        # Raw string: the embedded regex uses \? and \. escapes.
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version(link.xpath('string()').strip(),
                             link.attrib['href'],
                             mimetype='text/html')

        sponsor_links = page.xpath("//td[contains(@id, 'tdSponsors')]/a")
        for link in sponsor_links:
            bill.add_sponsor("primary", link.text)

        actor = chamber
        use_row = False
        self.debug(bill_id)
        for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):

            # Skip everything up to and including the header row.
            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('bill:introduced')
                atypes.append('bill:reading:1')
            elif action.startswith('Signed by Governor'):
                atypes.append('governor:signed')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                # A chamber name before 'Do Pass' means a floor action;
                # anything else is a committee action.
                if match.group(1) in ['Senate', 'House of Representatives']:
                    first = 'bill'
                else:
                    first = 'committee'
                atypes.append("%s:%s" % (first, match.group(3).lower()))

            if 'referred to' in action.lower():
                atypes.append('committee:referred')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment:introduced')
                atypes.append('amendment:passed')

            if 'Veto override, Passed' in action:
                atypes.append('bill:veto_override:passed')
            elif 'Veto override, Failed' in action:
                atypes.append('bill:veto_override:failed')

            if 'Delivered to the Governor' in action:
                atypes.append('governor:received')

            # First readings also tell us which chamber is acting.
            match = re.match(r"First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            # Roll-call links inside the action cell become vote objects.
            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(actor, action, date, type=atypes)

        subjects = []
        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            # Guard: anchors may have no text (link.text is None).
            if link.text:
                subjects.append(link.text.strip())
        bill['subjects'] = subjects

        self.save_bill(bill)
Exemple #51
0
    def scrape_bill(self, chamber, session, bill_id, title, sponsor, url):
        """Scrape one FL bill page: history (actions), version documents
        and staff-analysis documents. Each tab's table may be absent, in
        which case that section is skipped with a log message.
        """
        with self.urlopen(url) as raw_html:
            doc = lxml.html.fromstring(raw_html)
            doc.make_links_absolute(url)

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            bill.add_sponsor('introducer', sponsor)

            # Bill history tab -> actions.
            try:
                hist_table = doc.xpath(
                    "//div[@id = 'tabBodyBillHistory']/table")[0]
                for row in hist_table.xpath("tbody/tr"):
                    date = datetime.datetime.strptime(
                        row.xpath("string(td[1])"), "%m/%d/%Y").date()

                    # Normalize chamber names; anything else passes through.
                    raw_actor = row.xpath("string(td[2])")
                    actor = {
                        'Senate': 'upper',
                        'House': 'lower'
                    }.get(raw_actor, raw_actor)

                    # One cell may hold several bullet-separated actions.
                    cell_text = row.xpath("string(td[3])").strip()
                    for action in cell_text.split(u'\u2022'):
                        action = action.strip()
                        if not action:
                            continue

                        atype = []
                        if action.startswith('Referred to'):
                            atype.append('committee:referred')
                        elif action.startswith('Favorable by'):
                            atype.append('committee:passed')
                        elif action == "Filed":
                            atype.append("bill:filed")
                        elif action.startswith("Withdrawn"):
                            atype.append("bill:failed")

                        bill.add_action(actor, action, date, type=atype)
            except IndexError:
                self.log("No bill history for %s" % bill_id)

            # Bill text tab -> versions.
            try:
                version_table = doc.xpath(
                    "//div[@id = 'tabBodyBillText']/table")[0]
                for row in version_table.xpath("tbody/tr"):
                    version_name = row.xpath("string(td[1])").strip()
                    version_url = row.xpath("td/a[1]")[0].attrib['href']
                    bill.add_version(version_name, version_url)
            except IndexError:
                self.log("No version table for %s" % bill_id)

            # Staff analysis tab -> documents, named "title -- org (date)".
            try:
                analysis_table = doc.xpath(
                    "//div[@id = 'tabBodyStaffAnalysis']/table")[0]
                for row in analysis_table.xpath("tbody/tr"):
                    doc_name = row.xpath("string(td[1])").strip()
                    doc_name += " -- " + row.xpath("string(td[3])").strip()
                    doc_date = row.xpath("string(td[4])").strip()
                    if doc_date:
                        doc_name += " (%s)" % doc_date
                    doc_url = row.xpath("td/a")[0].attrib['href']
                    bill.add_document(doc_name, doc_url)
            except IndexError:
                self.log("No analysis table for %s" % bill_id)

            self.save_bill(bill)
Exemple #52
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Scrape every CA bill of one measure type for a session from the
        CABill database mirror: versions, sponsors, actions and votes.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        # self.session here is a SQLAlchemy session, not the legislative one.
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)


        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                # A non-zero session_num marks a special session.
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            # Title is filled in later from the newest version.
            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct session for web query, going from '20092010' to '0910'
            source_session = session[2:4] + session[6:8]

            # Turn 'AB 10' into 'ab_10'
            source_num = "%s_%s" % (bill.measure_type.lower(),
                                    bill.measure_num)

            # Construct a fake source url
            source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                          "bill_number=%s&sess=%s" %
                          (source_num, source_session))

            fsbill.add_source(source_url)

            scraped_versions = self.scrape_site_versions(bill, source_url)

            title = ''
            short_title = ''
            type = ['bill']
            subject = ''
            all_titles = set()
            # i walks scraped_versions in lockstep with bill.versions,
            # matched by action date below.
            i = 0
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                # Each iteration overwrites title/type/subject, so the
                # values left after the loop come from the latest version.
                title = clean_title(version.title)
                all_titles.add(title)
                short_title = clean_title(version.short_title)
                type = [bill_type]

                if version.appropriation == 'Yes':
                    type.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    type.append('fiscal committee')
                if version.local_program == 'Yes':
                    type.append('local program')
                if version.urgency == 'Yes':
                    type.append('urgency')
                if version.taxlevy == 'Yes':
                    type.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

                date = version.bill_version_action_date.date()

                # Pair this DB version with a scraped (date, url) tuple
                # when the dates line up; otherwise leave url empty.
                url = ''
                try:
                    scraped_version = scraped_versions[i]
                    if scraped_version[0] == date:
                        url = scraped_version[1]
                        i += 1
                except IndexError:
                    pass

                fsbill.add_version(
                    version.bill_version_id, url,
                    date=date,
                    title=title,
                    short_title=short_title,
                    subject=[subject],
                    type=type)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = type
            fsbill['subjects'] = [subject]

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # NOTE(review): `version` is the loop variable left over from
            # the versions loop above, i.e. sponsors come from the latest
            # version only — confirm that is intended.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            introduced = False

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                # Normalize actor to 'lower'/'upper'/'executive'; committee
                # actors keep their name with the chamber prefix replaced.
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                type = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                # Categorize; an action may receive several types.
                if act_str.startswith('Introduced'):
                    introduced = True
                    type.append('bill:introduced')

                if 'Read first time.' in act_str:
                    # First reading implies introduction if we have not
                    # seen an explicit 'Introduced' action yet.
                    if not introduced:
                        type.append('bill:introduced')
                        introduced = True
                    type.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    type.append('committee:referred')

                if 'Read third time.  Passed.' in act_str:
                    type.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    type.append('governor:signed')

                if 'Item veto' in act_str:
                    type.append('governor:vetoed:line-item')

                if 'Vetoed by Governor' in act_str:
                    type.append('governor:vetoed')

                if 'To Governor' in act_str:
                    type.append('governor:received')

                if 'Read second time' in act_str:
                    type.append('bill:reading:2')

                if not type:
                    type = ['other']

                fsbill.add_action(actor, act_str, action.action_date.date(),
                                  type=type)

            for vote in bill.votes:
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                # Location strings look like 'ASM. APPR.' or 'SEN. FLOOR';
                # first token names the chamber, the rest the location.
                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                motion = vote.motion.motion_text or ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                # Strip session tags, chamber prefixes, and bill-id noise
                # from the motion text.
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                # The abstain count field in CA's database includes
                # vacancies, which we aren't interested in.
                fsvote['other_count'] = len(fsvote['other_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Exemple #53
0
    def scrape(self, chamber, session):
        """Scrape OH bills for one chamber of a session from the LSC
        status-report spreadsheets (one .xlsx per bill-id prefix).

        Raises NoDataForPeriod for sessions before the 128th, which have
        no spreadsheets.
        """
        if int(session) < 128:
            raise NoDataForPeriod(session)

        base_url = 'http://www.lsc.state.oh.us/status%s/' % session

        bill_types = {'lower': [('hb','bill'),
                                ('hjr','joint resolution'),
                                ('hcr','concurrent resolution')],
                      'upper': [('sb','bill'),
                                ('sjr','joint resolution'),
                                ('scr','concurrent resolution')]}

        for bill_prefix, bill_type in bill_types[chamber]:
            url = base_url + '%s.xlsx' % bill_prefix

            try:
                fname, _ = self.urlretrieve(url)
            except scrapelib.HTTPError:
                # if there haven't yet been any bills of a given type
                # then the excel url for that type will 404
                continue

            sh = xlrd.open_workbook(fname).sheet_by_index(0)

            # once workbook is open, we can remove tempfile
            os.remove(fname)

            # Row N of the sheet is bill number N; row 0 is the header.
            for rownum in range(1, sh.nrows):
                bill_id = '%s %s' % (bill_prefix.upper(), rownum)
                bill_title = str(sh.cell(rownum, 3).value)
                bill = Bill(session, chamber, bill_id, bill_title,
                            type=bill_type)
                bill.add_source(url)
                bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

                # add cosponsor
                if sh.cell(rownum, 2).value:
                    bill.add_sponsor('cosponsor',
                                     str(sh.cell(rownum, 2).value))

                actor = ""

                # Actions start column after bill title; header row 0
                # names the action, the bill's row holds its date.
                for colnum in range(4, sh.ncols - 1):
                    action = str(sh.cell(0, colnum).value)
                    date = sh.cell(rownum, colnum).value

                    # Infer the acting body from the column header.
                    if action:
                        if action.split()[0] == 'House':
                            actor = "lower"
                        elif action.split()[0] == 'Senate':
                            actor = "upper"
                        elif action.split()[-1] == 'Governor':
                            actor = "executive"
                        elif action.split()[0] == 'Gov.':
                            actor = "executive"
                        elif action.split()[-1] == 'Gov.':
                            actor = "executive"

                    if action in ('House Intro. Date', 'Senate Intro. Date'):
                        atype = ['bill:introduced']
                        action = action.replace('Intro. Date', 'Introduced')
                    elif action == '3rd Consideration':
                        atype = ['bill:reading:3', 'bill:passed']
                    elif action == 'Sent to Gov.':
                        atype = ['governor:received']
                    elif action == 'Signed By Governor':
                        atype = ['governor:signed']
                    else:
                        atype = ['other']

                    # Excel stores dates as floats; empty cells are ''.
                    if isinstance(date, float):
                        date = datetime.datetime(
                            *xlrd.xldate_as_tuple(date, 0))
                        bill.add_action(actor, action, date, type=atype)

                self.scrape_votes(bill, bill_prefix, rownum, session)
                self.scrape_versions(bill, bill_prefix, rownum, session)
                self.save_bill(bill)
Exemple #54
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """Scrape one SC bill detail page: bill type, summary, sponsors,
        full-text versions, actions and (when present) vote history.
        """
        html = self.urlopen(bill_detail_url)

        if 'INVALID BILL NUMBER' in html:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        type_label = bill_div.xpath('span/text()')[0]

        # Most specific labels are tested first (so 'Concurrent
        # Resolution' wins over the bare 'Resolution').
        for marker, canonical in (('General Bill', 'bill'),
                                  ('Concurrent Resolution',
                                   'concurrent resolution'),
                                  ('Joint Resolution', 'joint resolution'),
                                  ('Resolution', 'resolution')):
            if marker in type_label:
                bill_type = canonical
                break
        else:
            raise ValueError('unknown bill type: %s' % type_label)

        # this is fragile, but less fragile than it was
        summary_label = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = summary_label.getnext().tail.strip()

        bill = Bill(session, chamber, bill_id, bill_summary, type=bill_type)
        bill['subjects'] = list(self._subjects[bill_id])

        # Sponsors: individual members and committees are both primary.
        for name in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsor('primary', name)
        for name in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
            bill.add_sponsor('primary', name.replace(u'\xa0', ' ').strip())

        # Versions live on a separate full-text page.
        full_text_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_doc = lxml.html.fromstring(self.urlopen(full_text_url))
        version_doc.make_links_absolute(full_text_url)
        for link in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance
            bill.add_version(link.text, link.get('href'),
                             on_duplicate='use_old',
                             mimetype='text/html')

        # Actions table: date / chamber / action text per row.
        chamber_map = {'Senate': 'upper', 'House': 'lower', None: 'other'}
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = chamber_map[chamber_td.text]

            # Drop trailing journal citations from the action text.
            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            bill.add_action(action_chamber, action, date,
                            action_type(action))


        # Votes, when the page links a vote-history view.
        vote_urls = doc.xpath('//a[text()="View Vote History"]/@href')
        if vote_urls:
            self.scrape_vote_history(bill, vote_urls[0])

        bill.add_source(bill_detail_url)
        self.save_bill(bill)
Exemple #55
0
    def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
        """bills from 2008 and below are in a 'pre' element and is simpler to
        parse them as text

        Fixes the duplicated ``date = date = ...`` assignment and
        simplifies the actor/passed boolean checks. Vote parsing is
        stateful: self.flag()/self.vote carry AYES/NAYS/absent context
        across lines until a 'Floor Sponsor' line closes the vote.
        """
        url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
            session, bill_id.replace(' ', ''))
        bill_page = self.urlopen(url)
        html = lxml.html.fromstring(bill_page)
        text = html.xpath('//pre')[0].text.split('\r\n')

        # title: join the upper-cased segments of the second line
        title = " - ".join(
            [x.strip() for x in text[1].split('-') if x.isupper()])
        # bill type
        bill_type = get_bill_type(bill_id)

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        # sponsors follow 'by' on the first line
        sponsors = text[0].split('by')[-1]
        for sponsor in sponsors.split(','):
            bill.add_sponsor('primary', sponsor)

        actor = chamber
        self.flag()  # clear last bills vote flags
        self.vote = None  # vote currently being accumulated, if any

        for line in text:

            if re.match(r'^\d\d/\d\d', line):
                # Action line: starts with mm/dd; the year comes from
                # the session string.
                date = datetime.datetime.strptime(
                    line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                self.last_date = date
                action_text = line[5:].strip()
                # actor
                if action_text.lower().startswith(('house', 'senate')):
                    actor = {'H': 'lower', 'S': 'upper'}[action_text[0]]

                action = get_action(actor, action_text)
                bill.add_action(actor, action_text, date, type=action)
                # Passage/failure actions open a vote; tallies look
                # like 'yes-no-other'.
                if "bill:passed" in action or "bill:failed" in action:
                    passed = 'FAILED' not in action_text
                    votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                    if votes:
                        yes, no, other = votes.groups()
                        self.in_vote = True
                        self.vote = Vote(chamber, date, action_text, passed,
                                         int(yes), int(no), int(other))
            else:
                date = self.last_date
                # nothing to do if its not a vote
                if "Floor Sponsor" in line:
                    # 'Floor Sponsor' terminates the roll-call listing.
                    self.in_vote = False
                    if self.vote:
                        bill.add_vote(self.vote)
                        self.vote = None

                if not self.in_vote:
                    continue
                # Switch which bucket subsequent names fall into.
                if 'AYES --' in line:
                    self.flag(ayes=True)
                elif 'NAYS --' in line:
                    self.flag(nays=True)
                elif 'Absent and excused' in line:
                    self.flag(other=True)

                if self.ayes:
                    for name in line.replace('AYES --', '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.yes(name)

                if self.nays:
                    for name in line.replace('NAYS --', '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.no(name)

                if self.other:
                    for name in line.replace('Absent and excused --',
                                             '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.other(name)

        self.save_bill(bill)
Exemple #56
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
        """Scrape a bill detail page: versions, sponsors, house votes and
        the action history categorized via self.categorizer.

        ``strip_sponsors`` defaults to a compiled regex's ``sub`` method
        that removes short parenthesized annotations from sponsor text.
        """
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # Bill type comes from the id prefix minus the chamber letter.
        prefix = bill_id.split()[0][1:]
        bill = Bill(session, chamber, bill_id, title,
                    type=self.bill_types[prefix])
        bill.add_source(url)

        for version in self.scrape_versions(session, chamber, doc, bill_id):
            bill.add_version(**version)

        # Resolution pages have different html; read the history table
        # into a heading -> value mapping.
        values = {}
        for row in doc.xpath('//div[@id="bhistcontent"]/table/tr'):
            headings = row.xpath('td/strong/text()')
            if not headings:
                continue
            heading = headings[0]
            values[heading] = row.text_content().replace(heading, '').strip()

        # summary was always same as title
        #bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        lead = strip_sponsors('', values['LEAD SPONSOR:'])
        if lead:
            bill.add_sponsor('primary', lead)

        # Add cosponsors.
        for cosponsor in strip_sponsors('', values['SPONSORS:']).split('\r\n'):
            cosponsor = cosponsor.strip(', ')
            if cosponsor:
                bill.add_sponsor('cosponsor', cosponsor)

        for link in doc.xpath("//a[contains(@href, 'votes/house')]"):
            self.scrape_vote(bill, link.attrib['href'])

        # Walk the action table oldest-first so `actor` tracks the
        # chamber currently holding the bill.
        actor = chamber
        rows = doc.xpath("//table[@class='tabborder']/descendant::tr")[1:]
        for row in reversed(rows):
            cells = row.xpath('td')
            if len(cells) < 3:
                continue

            # Index of date info no longer varies on resolutions.
            date = datetime.datetime.strptime(
                cells[2].text_content().strip(), "%m/%d/%y").date()
            action = cells[1].text_content().strip()

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        self.save_bill(bill)
Exemple #57
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one Iowa bill: title, versions, sponsors, and actions.

        `url` is the bill's sidebar page; the history page is reached
        through the 'Bill History' popup link found there.
        """
        sidebar = lxml.html.fromstring(self.urlopen(url))

        try:
            hist_url = get_popup_url(
                sidebar.xpath("//a[contains(., 'Bill History')]")[0])
        except IndexError:
            # No history link on the sidebar -- nothing we can scrape.
            return

        page = lxml.html.fromstring(self.urlopen(hist_url))
        page.make_links_absolute(hist_url)

        title = page.xpath("string(//table[2]/tr[4])").strip()

        # Classify the bill from its id prefix. Check the three-letter
        # prefixes before falling through to the plain-bill default.
        if 'HR' in bill_id or 'SR' in bill_id:
            bill_type = ['resolution']
        elif 'HJR' in bill_id or 'SJR' in bill_id:
            bill_type = ['joint resolution']
        elif 'HCR' in bill_id or 'SCR' in bill_id:
            bill_type = ['concurrent resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(hist_url)

        # Versions come from the sidebar's version <select>; the selected
        # option is the current page itself, reached via a frame-number swap.
        for option in sidebar.xpath("//select[@name='BVer']/option"):
            version_name = option.text.strip()
            if option.get('selected'):
                version_url = re.sub(r'frm=2', 'frm=1', url)
            else:
                version_url = option.attrib['value']
            bill.add_version(version_name, version_url)

        if not bill['versions']:
            # No version dropdown at all -- assume only the introduced text.
            version_url = re.sub(r'frm=2', 'frm=3', url)
            bill.add_version('Introduced', version_url)

        sponsors = page.xpath("string(//table[2]/tr[3])").strip()
        sponsor_re = r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)'
        for sponsor in re.findall(sponsor_re, sponsors):
            sponsor = sponsor.replace(' and', '').strip(' .,')

            # a few sponsors get mangled by our regex
            sponsor = {
                'Means': 'Ways & Means',
                'Safety': 'Public Safety',
                'Resources': 'Human Resources',
                'Affairs': 'Veterans Affairs',
                'Protection': 'Environmental Protection',
                'Government': 'State Government',
                'Boef': 'De Boef'
            }.get(sponsor, sponsor)

            bill.add_sponsor('sponsor', sponsor)

        # Default the actor to the bill's own chamber; the original left
        # `actor` unbound until an action mentioned a journal/committee
        # marker, which raised UnboundLocalError on other first actions.
        actor = chamber

        for tr in page.xpath("//table[3]/tr"):
            date = tr.xpath("string(td[1])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[2])").strip()
            action = re.sub(r'\s+', ' ', action)

            # Journal / committee-substitute markers tell us which chamber
            # acted; the actor then persists for subsequent rows.
            if 'S.J.' in action or 'SCS' in action:
                actor = 'upper'
            elif 'H.J.' in action or 'HCS' in action:
                actor = 'lower'

            action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()

            if action.startswith('Introduced'):
                atype = ['bill:introduced']
                if ', referred to' in action:
                    atype.append('committee:referred')
            elif action.startswith('Read first time'):
                atype = 'bill:reading:1'
            elif action.startswith('Referred to'):
                atype = 'committee:referred'
            elif action.startswith('Sent to Governor'):
                atype = 'governor:received'
            elif action.startswith('Signed by Governor'):
                atype = 'governor:signed'
            elif action.startswith('Vetoed by Governor'):
                atype = 'governor:vetoed'
            elif action.startswith('Item veto'):
                atype = 'governor:vetoed:line-item'
            elif re.match(r'Passed (House|Senate)', action):
                atype = 'bill:passed'
            elif re.match(r'Amendment (S|H)-\d+ filed', action):
                atype = ['amendment:introduced']
                if ', adopted' in action:
                    atype.append('amendment:passed')
            elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted',
                          action):
                atype = 'amendment:passed'
            # NOTE(review): the original pattern read (S|N); amendments are
            # S- or H- numbered everywhere else here, so (S|H) is assumed
            # to be the intent -- confirm against live data.
            elif re.match(r'Amendment (S|H)-\d+ lost', action):
                atype = 'amendment:failed'
            elif action.startswith('Resolution filed'):
                atype = 'bill:introduced'
            elif action.startswith('Resolution adopted'):
                atype = 'bill:passed'
            elif (action.startswith('Committee report')
                  and action.endswith('passage.')):
                atype = 'committee:passed'
            elif action.startswith('Withdrawn'):
                atype = 'bill:withdrawn'
            else:
                atype = 'other'

            bill.add_action(actor, action, date, type=atype)

        bill['subjects'] = self._subjects[bill_id]
        self.save_bill(bill)
Exemple #58
0
    def scrape_bill(self, session, history_url):
        """Scrape one Texas bill from its bill-history XML document.

        Pulls title, subjects, versions/analyses/fiscal notes/witness
        lists (pre-collected on self.*), actions, and sponsorships.
        """
        history_xml = self.get(history_url).text.encode('ascii', 'ignore')
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        # history_xml is bytes, so compare against a bytes literal:
        # `str in bytes` raises TypeError on Python 3 (identical on Py2).
        if (bill_title is None or b"Bill does not exist" in history_xml):
            self.warning("Bill does not appear to exist")
            return
        # attrib['bill'] looks like "<session> <id...>"; drop the session.
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        # Classify from the letters following the chamber prefix.
        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        bill.add_source(history_url)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        # The slug character 5 from the end of each URL keys NAME_SLUGS.
        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version(name=self.NAME_SLUGS[version[1][-5]],
                             url=version[1],
                             mimetype='text/html')

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document(name="Analysis ({})".format(
                self.NAME_SLUGS[analysis[1][-5]]),
                              url=analysis[1],
                              mimetype='text/html')

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document(name="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
                              url=fiscal_note[1],
                              mimetype='text/html')

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document(name="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
                              url=witness[1],
                              mimetype='text/html')

        # Whether an introduction action has already been recorded. This
        # must persist across ALL actions: the original reset it inside
        # the loop, so the 'Read & adopted' duplicate-introduction guard
        # below could never fire.
        introduced = False

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            # First letter of the action number encodes the acting body.
            actor = {
                'H': 'lower',
                'S': 'upper',
                'E': 'executive'
            }[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'bill:filed'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Vetoed by the Governor':
                atype = 'governor:vetoed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['bill:passed']
                if not introduced:
                    # Only tag introduction once per bill.
                    introduced = True
                    atype.append('bill:introduced')
            elif desc == "Passed as amended":
                atype = 'bill:passed'
            elif (desc.startswith('Referred to')
                  or desc.startswith("Recommended to be sent to ")):
                atype = 'committee:referred'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee:passed'
            elif desc == "Filed":
                atype = 'bill:filed'
            elif desc == 'Read 3rd time':
                atype = 'bill:reading:3'
            elif desc == 'Read 2nd time':
                atype = 'bill:reading:2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee:passed:favorable'
            else:
                atype = 'other'

            # Works whether atype is a str (substring test is exact here)
            # or a list of types.
            if 'committee:referred' in atype:
                repls = ['Referred to', "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                extra['committees'] = ctty

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        # Sponsorship lists are pipe-delimited; skip empty entries.
        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsor('primary', author, official_type='author')
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsor('cosponsor',
                                 coauthor,
                                 official_type='coauthor')
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsor('primary', sponsor, official_type='sponsor')
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsor('cosponsor',
                                 cosponsor,
                                 official_type='cosponsor')

        self.save_bill(bill)
Exemple #59
0
    def scrape(self, chamber, session):
        """Scrape all Kansas bills for one chamber from the ksapi JSON feed."""
        # check for abiword, required for rendering bill text
        if os.system('which abiword') != 0:
            raise ScrapeError('abiword is required for KS scraping')

        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.urlopen(ksapi.url + 'bill_status/')
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            # Order matters: 'CR' must be checked before the bare 'R'.
            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # The original left btype unbound here, raising NameError
                # on any unexpected id; default defensively instead.
                self.warning('unrecognized bill type for %s' % bill_id)
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(session, chamber, bill_id, title,
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            # Record the long title as an alternate when it differs.
            if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill['title']):
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):

                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            # Bill text lives on an HTML page; a missing page is non-fatal.
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}: {1}'.format(
                    bill['bill_id'], e))
            self.save_bill(bill)
Exemple #60
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_type):
        """Scrape one Louisiana bill page: sponsors, documents, votes, actions."""
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(
            page,
            "//a[@id='ctl00_PageBody_LinkAuthor']/text()"
        )

        # Follow the named sidebar link and return the rows of its page.
        def scrape_linked(link_text):
            href = page.xpath(
                "//a[contains(text(), '%s')]" % link_text)[0].attrib['href']
            return self.scrape_bare_page(href)

        authors = [x.text for x in scrape_linked("Authors")]

        try:
            digests = scrape_linked("Digests")
        except IndexError:
            digests = []

        try:
            versions = scrape_linked("Text")
        except IndexError:
            versions = []

        title = page.xpath(
            "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']/"
            "/table[@style='font-size:small']/tr")

        bill_id = page.xpath(
            "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(bill_url)

        # The primary author also appears in the full author list.
        authors.remove(author)
        bill.add_sponsor('primary', author)
        for coauthor in authors:
            bill.add_sponsor('cosponsor', coauthor)

        for digest in digests:
            bill.add_document(digest.text,
                              digest.attrib['href'],
                              mimetype="application/pdf")

        for version in versions:
            bill.add_version(version.text,
                             version.attrib['href'],
                             mimetype="application/pdf")

        # Keyword fragments (lower-cased match) mapped to action types.
        flags = {
            "prefiled": ["bill:filed"],
            "referred to the committee": ["committee:referred"],
            "sent to the house": ['bill:passed'],
            "ordered to the senate": ['bill:passed'],
        }

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        for action in actions:
            # Do not reuse the `chamber`/`page` names here -- the original
            # unpacking shadowed both the parameter and the document.
            date, act_chamber, _page_no, text = [
                x.text for x in action.xpath(".//td")]
            date += "/%s" % (session)  # Session is April --> June. Prefiles
            # look like they're in January at earliest.
            date = dt.datetime.strptime(date, "%m/%d/%Y")
            act_chamber = {"S": "upper", "H": "lower", "J": 'joint'}[act_chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            if cat == []:
                cat = ["other"]
            bill.add_action(act_chamber, text, date, type=cat)

        self.save_bill(bill)