Example #1
    def scrape(self, session, chambers):
        urlified_session_id = session.replace(':', '-')
        url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projets-loi-%s.html' % urlified_session_id
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # scrape all the actions for this session
        actions = self.scrape_actions(urlified_session_id)

        for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'):
            id_td, details_td = row.xpath('td')[:2]
            bill_id = clean_spaces(id_td.text_content())
            pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0]
            bill_name = clean_spaces(pdf_link.text_content())
            pdf_url = pdf_link.xpath('@href')[0]
            detail_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projet-loi-%s-%s.html' % (bill_id, urlified_session_id)
            bill = Bill(session, 'lower', bill_id, bill_name)
            bill.add_source(url)
            bill.add_source(detail_url)
            bill.add_source(pdf_url)
            # add actions
            for action in actions[bill_id]:
                bill.add_action('lower', action['name'], action['date'])
            # get sponsors
            self.scrape_details(bill, detail_url)
            self.save_bill(bill)
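
Every example here leans on a clean_spaces helper that is never shown. A minimal sketch of what it presumably does, collapsing runs of whitespace and trimming the ends (this implementation is an assumption, not the original helper):

import re

def clean_spaces(text):
    # Collapse any run of whitespace (including newlines from
    # text_content()) into a single space, then trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

print(clean_spaces('  Projet de loi\n  no 1  '))  # 'Projet de loi no 1'
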
Example #2
    def scrape(self, session, chambers):
        # Get the progress table.
        url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        for tr in doc.xpath('//table[@class="bills"]/tr')[1:]:
            bill_id = clean_spaces(tr[0].text_content()).strip('*')
            if not bill_id:
                break # empty rows extend past actual list of bills
            if bill_id.endswith("."):
                bill_id = bill_id[:-1]

            title = clean_spaces(tr[1].text_content())
            chapter = tr[-1].text_content()

            bill = Bill(session, 'lower', bill_id, title, type='bill')

            if chapter:
                bill['chapter'] = chapter

            # FIXME need to do more work to figure out what
            # version the text *really* is
            td = tr[1]
            bill_url = td.xpath('a/@href')
            if bill_url:
                bill.add_version(url=bill_url.pop(), name='First Reading',
                    mimetype='text/html')

            # Actions and version urls.
            data = zip([
                'First Reading',
                'Second Reading',
                'Committee',
                'Amendments',
                'Third Reading',
                'Royal Assent',
                'Act'],
                tr[2:-1])

            for action, td in data:
                date_text = td.text_content()
                fmt = r'%b. %d/%Y'
                try:
                    date = datetime.datetime.strptime(date_text, fmt)
                except ValueError:
                    # this cell doesn't hold a parseable date; skip the action
                    continue

                attrs = dict(action=action, date=date, actor='lower')
                attrs.update(self.categorizer.categorize(action))
                bill.add_action(**attrs)

            bill.add_source(url)
            self.save_bill(bill)
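
The zip of stage labels against the row's remaining td cells is the load-bearing trick here: zip stops at the shorter sequence, so a truncated row simply yields fewer (label, cell) pairs instead of raising. A standalone illustration with plain strings standing in for the cells (values invented):

labels = ['First Reading', 'Second Reading', 'Committee']
cells = ['Mar. 5/2013', '']          # one cell short of the label list

for label, cell in zip(labels, cells):
    print(label, '->', cell or '(no date)')
# First Reading -> Mar. 5/2013
# Second Reading -> (no date)
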
Example #3
    def scrape(self, session, chambers):
        url = 'http://www.ontla.on.ca/web/bills/bills_all.do?locale=en&parlSessionID=%s' % session
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//table/tr'):
            id_td, title_td, sponsor = row.xpath('td')
            bill_id = id_td.text_content().strip()
            title = clean_spaces(title_td.text_content())
            # pull sponsor off different page
            bill = Bill(session, 'lower', bill_id, title)
            # skip to detail page
            detail_url = title_td.xpath('a/@href')[0] + "&detailPage=bills_detail_status"
            bill.add_source(url)
            bill.add_source(detail_url)

            # get actions & sponsors
            self.scrape_details(bill, detail_url)

            if not bill['versions']:
                self.warning('no versions detected via normal method, using '
                             'top-level page')
                bill.add_version('Original (current version)',
                                 title_td.xpath('a/@href')[0],
                                 mimetype='text/html')

            self.save_bill(bill)
Example #4
    def scrape_details(self, bill, detail_url):
        data = self.urlopen(detail_url)
        doc = lxml.html.fromstring(data)

        # versions
        versions = doc.xpath('//option')
        # skip first option, is a placeholder
        for version in versions[1:]:
            v_name = clean_spaces(version.text_content())
            v_url = detail_url + '&BillStagePrintId=' + version.get('value')
            bill.add_version(v_name, v_url, mimetype='text/html',
                             on_duplicate='use_new')
            # can get PDF links as well by opening doc & looking for 'pdf'
            #version_doc = lxml.html.fromstring(self.urlopen(version_url))

        # sponsors
        for sp in doc.xpath('//span[@class="pSponsor"]/a'):
            bill.add_sponsor('primary', clean_spaces(sp.text_content()))
        for sp in doc.xpath('//span[@class="sSponsor"]/a'):
            bill.add_sponsor('cosponsor', clean_spaces(sp.text_content()))

        # actions
        for row in doc.xpath('//table//tr')[1:]:
            date, stage, activity, committee = row.xpath('td/text()')
            date = datetime.strptime(clean_spaces(date), "%B %d, %Y")
            stage = clean_spaces(stage)
            activity = clean_spaces(activity)
            committee = clean_spaces(committee)
            # action prefixed with stage if present
            action = '%s - %s' % (stage, activity) if stage else activity
            # default to lower, use committee if present
            actor = committee if committee else 'lower'
            bill.add_action(actor, action, date)
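
One fragile spot: row.xpath('td/text()') returns text nodes, so a td with no text contributes nothing and the four-way unpack raises ValueError on such rows. A self-contained demonstration with lxml (markup invented):

import lxml.html

row = lxml.html.fromstring(
    '<table><tr><td>May 1, 2013</td><td></td>'
    '<td>First Reading</td><td></td></tr></table>').xpath('//tr')[0]

print(len(row.xpath('td/text()')))   # 2 -- the empty cells vanish
print(len(row.xpath('td')))          # 4 -- reading elements keeps the shape

A more defensive variant would iterate row.xpath('td') and call text_content() on each cell before unpacking.
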
Example #5
    def scrape_actions(self, session_id):
        """
        Scrapes all the actions for all the bills in a given session, and
        returns them as a dictionary keyed by bill ID.
        """
        actions_list_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/rapport/projets-loi-%s.html' % session_id
        actions_doc = lxml.html.fromstring(self.urlopen(actions_list_url))
        # compile regular expressions for bill action dates
        long_date_pattern = re.compile(r'\d\d? \w+ \d\d\d\d')
        short_date_pattern = re.compile(r'\d\d\d\d-\d\d-\d\d')
        # The dates are in French, so switch to a French locale once,
        # up front, before parsing them with strptime below.
        locale.setlocale(locale.LC_ALL, 'fr_CA.utf8')
        # Make a dictionary of actions for each bill number
        actions = dict()
        for td in actions_doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr/td'):
            bill_number = td.xpath('div/div/div')[0].text_content()
            bill_number = clean_spaces(bill_number.strip(u'N\xb0'))
            actions[bill_number] = []
            for action_row in td.xpath('div/div/table//tr'):
                action_name_td, action_date_td = action_row.xpath('td')
                action_name = action_name_td.text_content().strip(': ')
                action_date = clean_spaces(action_date_td.text_content())
                # Try the long form ("12 juin 2013") first, then the
                # short ISO form ("2013-06-12").
                try:
                    action_date = long_date_pattern.search(action_date).group(0)
                    action_date = datetime.strptime(action_date, '%d %B %Y')
                except AttributeError:
                    try:
                        action_date = short_date_pattern.search(action_date).group(0)
                        action_date = datetime.strptime(action_date, '%Y-%m-%d')
                    except (AttributeError, ValueError):
                        # Can't parse the date, so give up on this action
                        continue
                actions[bill_number].append({
                    'name': action_name,
                    'date': action_date,
                })
        return actions
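
locale.setlocale changes the locale process-wide (and requires fr_CA.utf8 to be installed), which is fragile in threaded scrapers. A locale-free sketch that maps French month names by hand (the month list and helper name are mine, not the original code's):

import re
from datetime import datetime

FRENCH_MONTHS = [u'janvier', u'f\xe9vrier', u'mars', u'avril', u'mai',
                 u'juin', u'juillet', u'ao\xfbt', u'septembre',
                 u'octobre', u'novembre', u'd\xe9cembre']

def parse_french_date(text):
    # Match '12 juin 2013'-style dates, then translate the month name.
    match = re.search(r'(\d{1,2}) (\w+) (\d{4})', text, re.UNICODE)
    if match is None:
        return None
    day, month_name, year = match.groups()
    try:
        month = FRENCH_MONTHS.index(month_name.lower()) + 1
    except ValueError:
        return None
    return datetime(int(year), month, int(day))

print(parse_french_date(u'Adoption : 12 juin 2013'))  # 2013-06-12 00:00:00
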
Example #6
    def scrape_details(self, bill, detail_url):
        data = self.urlopen(detail_url)
        doc = lxml.html.fromstring(data)

        # Collect all the h3s together in a dict
        headings = dict()
        for heading in doc.xpath('//h3'):
            title = clean_spaces(heading.text_content())
            if len(title) > 0:
                headings[title] = heading

        # sponsors
        # TODO: is it possible for there to be more than one sponsor?
        if 'Auteur' in headings:
            sponsor = headings['Auteur'].xpath('following-sibling::*//a')[0].text_content().strip()
            bill.add_sponsor('primary', sponsor)
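
The heading lookup works because XPath's following-sibling axis walks forward from the matched h3 through everything after it at the same level. A toy version of the same query (markup invented):

import lxml.html

doc = lxml.html.fromstring(
    '<div><h3>Auteur</h3><p><a href="#">Jane Doe</a></p>'
    '<h3>Notes</h3><p>...</p></div>')

heading = doc.xpath('//h3')[0]
# First <a> anywhere inside the siblings that follow this heading.
print(heading.xpath('following-sibling::*//a')[0].text_content())  # Jane Doe
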
Example #7
    def scrape_legislator(self, data, url, term):
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # Full name.
        full_name = doc.xpath('//b[starts-with(., "MLA:")]/text()').pop()
        if ':' in full_name:
            _, full_name = full_name.split(':')
        # Remove the "Hon. " honorific prefix if present; str.strip('Hon. ')
        # would strip individual characters from both ends, not the prefix.
        if full_name.lstrip().startswith('Hon.'):
            full_name = full_name.lstrip()[len('Hon.'):]
        full_name = clean_spaces(full_name)

        # Offices
        for xpath in [('//b[starts-with(., "MLA:")]/../'
                             'following-sibling::p/b/i/text()'),
                      ('//b[starts-with(., "MLA:")]/../'
                             'following-sibling::p/em/b/text()'),
                      ('//b[starts-with(., "MLA:")]/../'
                             'following-sibling::p/em/strong/text()'),
                      ('//b[starts-with(., "MLA:")]/../'
                             'following-sibling::p/strong/em/text()')]:
            district = doc.xpath(xpath)
            if district:
                district = clean_spaces(district.pop())
                break

        for xpath in [('//b[starts-with(., "MLA:")]/../'
                          'following-sibling::p/b/text()'),
                      ('//b[starts-with(., "MLA:")]/../'
                          'following-sibling::p/strong/text()')]:
            party = doc.xpath(xpath)
            if party:
                party = clean_spaces(party.pop()).title()
                break

        email = doc.xpath('//a[starts-with(@href, "mailto:")]/text()').pop()

        xpath = '//p[starts-with(., "Phone:")]/../following-sibling::td[1]'
        phone = [p.text_content() for p in doc.xpath(xpath)]
        if len(phone) == 1:
            phone.append(doc.xpath('//p[starts-with(., "Phone:")]')[-1][0].tail)

        xpath = '//p[starts-with(., "Fax:")]/../following-sibling::td[1]'
        fax = [p.text_content() for p in doc.xpath(xpath)]
        if len(fax) == 1:
            fax.append(doc.xpath('//p[starts-with(., "Fax:")]')[-1][0].tail)

        xpath = '//p[starts-with(., "Toll free:")]/../following-sibling::td[1]'
        toll_free = [p.text_content() for p in doc.xpath(xpath)]

        leg = Legislator(term=term, full_name=full_name, email=email,
            district=district, party=party, chamber='lower', **data)
        leg['toll_free_phone'] = toll_free
        leg['url'] = url

        # Constituencies
        for dist_office in doc.xpath(
            '//b[contains(., "Constituency:")]'):
            dist_office = dist_office.getparent().getparent().text_content()
            _, dist_office = dist_office.split(':')
            dist_office = dist_office.strip()
            leg.add_office('district', 'Constituency Office',
                address=dist_office,
                phone=phone.pop(), fax=fax.pop())

        # Capitol
        xpath = '//*[starts-with(., "Office:")]/../../text()'
        capitol_address = doc.xpath(xpath)
        capitol_address = '\n'.join(s.strip() for s in capitol_address)
        capitol_address = capitol_address.strip()
        leg.add_office('capitol', 'Office', address=capitol_address,
            phone=phone.pop(), fax=fax.pop())

        leg.add_source(url, page="legislator detail page")
        return leg
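
Note that phone.pop() and fax.pop() with no argument remove from the end of the list, so the constituency office consumes the last numbers scraped and the capitol office the ones before them; the pairing silently depends on the page listing them in that order. A two-line reminder (numbers invented):

phone = ['250-555-0100', '250-555-0199']
print(phone.pop())  # '250-555-0199' -- taken from the end
print(phone.pop())  # '250-555-0100'
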
Example #8
    def scrape(self, session, chambers):
        # Get the progress table.
        url = 'http://www.leg.bc.ca/%s/votes/progress-of-bills.htm' % session
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)
        session_start = self.metadata['session_details'][session]['start_date']
        session_end = self.metadata['session_details'][session]['end_date']

        for tr in doc.xpath('//table[@class="votestable"]/tr')[1:]:
            bill_id = clean_spaces(tr[0].text_content()).strip('*')
            if 'Ruled out of order' in bill_id:
                continue
            title = clean_spaces(tr[1].text_content())
            if title == 'Title':
                # This is a header row.
                continue
            sponsor = clean_spaces(tr[2].text_content())
            chapter = tr[-1].text_content()

            bill = Bill(session, 'lower', bill_id, title, type='bill')
            bill.add_sponsor(name=sponsor, type='primary')

            if chapter:
                bill['chapter'] = chapter

            # Actions and version urls.
            data = zip([
                'Reading',
                'Second Reading',
                'Committee',
                'Report',
                'Amended',
                'Third Reading',
                'Royal Assent',
                'S.B.C. Chap. No.'],
                tr[3:-1])

            for action, td in data:
                version_url = td.xpath('a/@href')
                if version_url:
                    bill.add_version(url=version_url.pop(), name=action,
                        mimetype='text/html')

                date_text = td.text_content()
                date = None
                for fmt in [r'%b %d', r'%b. %d']:
                    try:
                        date = datetime.datetime.strptime(date_text, fmt)
                    except ValueError:
                        continue
                    else:
                        break
                if date is None:
                    continue

                # guess the year of the action
                date = datetime.datetime(month=date.month, day=date.day,
                                         year=session_start.year)
                if date < session_start or date > session_end:
                    date = datetime.datetime(month=date.month, day=date.day,
                                             year=session_end.year)
                if date < session_start or date > session_end:
                    self.error('action %s appears to have occurred on %s, '
                               'which is outside of session', action, date)
                # XXX: it should be noted that this isn't perfect
                # if a session is longer than a year there's a chance we get
                # the action date wrong (with a preference for the earliest
                # year)
                # in practice this doesn't seem to happen, and hopefully
                # if/when it does they will add years to these action dates

                attrs = dict(action=action, date=date, actor='lower')
                attrs.update(self.categorizer.categorize(action))
                bill.add_action(**attrs)

            bill.add_source(url)
            self.save_bill(bill)
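
The year-guessing step leans on a strptime quirk: a format with no year field defaults the year to 1900, so the scraper rebuilds the date with the session's start year and falls back to the end year if that lands outside the session. A compact illustration (session bounds invented):

import datetime

session_start = datetime.datetime(2012, 10, 1)
session_end = datetime.datetime(2013, 5, 30)

parsed = datetime.datetime.strptime('Feb 27', '%b %d')  # year defaults to 1900
date = parsed.replace(year=session_start.year)          # try the start year
if not (session_start <= date <= session_end):
    date = date.replace(year=session_end.year)          # then the end year
print(date)  # 2013-02-27 00:00:00
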