Esempio n. 1
0
    def get_bill_information(self, bill_id, chamber, session):
        """Fetch and parse the bill-info page for *bill_id* and return a Bill.

        POSTs the bill id to BILL_INFO_URL, extracts sponsors, title and
        bill type from the returned HTML, and builds the Bill object.
        """
        with self.urlopen(BILL_INFO_URL, 'POST', body="hListBills=" + bill_id) as bill_info_page:
            self.log("Got bill info")
            page = lxml.html.fromstring(bill_info_page)

            # TODO: check whether page is error page and raise custom exception defined above

            # Default so a page without a "BY" section doesn't raise
            # NameError when sponsors are added below.
            sponsors = []
            bs = page.xpath('//div/b')
            for b in bs:
                containing_div = b.getparent()
                if b.text == "BY":
                    # NOTE: strip()/lstrip() remove a *set* of characters,
                    # not a prefix; kept because these labels are short and
                    # the behavior matches the original.
                    names = containing_div.text_content().strip(u'BY\xa0').split(',')
                    sponsors = [name.strip(' ') for name in names]
                if b.text.strip(u',\xa0') == "ENTITLED":
                    title = containing_div.text_content().lstrip(u'ENTITLED,\xa0')

            # Classify the bill by matching page text against type_regs;
            # the last matching regex wins.
            divs = page.xpath('//div')
            bill_type = ""
            for div in divs:
                text = div.text_content()
                for ind, reg in enumerate(self.type_regs):
                    if reg.match(text):
                        bill_type = self.bill_types[ind]

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            # First listed sponsor is primary; the rest are cosponsors.
            for ind, sponsor in enumerate(sponsors):
                if ind == 0:
                    bill.add_sponsor('primary', sponsor)
                else:
                    bill.add_sponsor('cosponsor', sponsor)
        return bill
Esempio n. 2
0
    def parse_senate_billpage(self, bill_url, year):
        """Scrape a single Senate bill page and save the resulting Bill."""
        with self.urlopen(bill_url) as bill_page:
            bill_page = BeautifulSoup(bill_page)
            # get all the info needed to record the bill
            bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
            bill_title = bill_page.find(id="lblBillTitle").font.string
            bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
            bill_lr = bill_page.find(id="lblLRNum").font.string

            bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                        bill_lr=bill_lr, official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor
            sponsor_tag = bill_page.find(id="hlSponsor")
            bill_sponsor = sponsor_tag.i.font.contents[0]
            # BUG FIX: ``tag.href`` looks up a *child element* named href and
            # yields None; attributes need ``tag['href']`` / ``tag.get()``.
            bill_sponsor_link = sponsor_tag.get('href')
            bill.add_sponsor('primary', bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist
            # BUG FIX: ``'href' in tag`` tests the tag's children, not its
            # attributes, so the cosponsor page was never followed.
            cosponsor_tag = bill_page.find(id="hlCoSponsors")
            if cosponsor_tag and cosponsor_tag.get('href'):
                self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

            # get the actions
            action_url = bill_page.find(id="hlAllActions")['href']
            self.parse_senate_actions(bill, action_url)

            # stored on a separate page
            versions_url = bill_page.find(id="hlFullBillText")
            if versions_url:
                self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
Esempio n. 3
0
    def scrape_bill_info(self, chamber, session):
        """Read the CT bill_info CSV and register this chamber's bills."""
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        raw = self.urlopen(info_url)
        reader = csv.DictReader(StringIO.StringIO(raw))

        abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        for row in reader:
            bill_id = row['bill_num']
            # Only keep rows belonging to the requested chamber.
            if bill_id[0] != abbrev:
                continue

            # Classify by id prefix: SJ/HJ -> joint resolution,
            # SR/HR -> resolution, everything else is a plain bill.
            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id,
                        row['bill_title'].decode('latin-1'),
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor('introducer', introducer)

            bill['subjects'] = self._subjects[bill_id]

            self.bills[bill_id] = bill
Esempio n. 4
0
    def scrape_bill(self, chamber, session, bill_id):
        """
        Scrapes documents, actions, vote counts and votes for
        a given bill.
        """
        session_id = self.get_session_id(session)
        bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&legislativeBody={}'.format(
            bill_id, session_id, self.chamber_map[chamber])
        api_response = self.get(bill_json_url)
        data = json.loads(api_response.content)

        # The API's canonical bill number replaces the one we were given.
        internal_id = data['BillId']
        bill_id = data['Number']

        bill = Bill(
            session=session,
            chamber=chamber,
            bill_id=bill_id,
            title=data['ShortTitle'],
            type=self.get_bill_type(bill_id)
        )

        # Attach each category of related records.
        self.scrape_actions(bill, data)
        self.scrape_versions(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)

        overview_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
            internal_id, session_id)
        bill.add_source(overview_url)

        self.save_bill(self.sort_bill_actions(bill))
Esempio n. 5
0
    def scrape_regular_row(self, chamber, session, row):
        """Returns bill attributes from row.

        Parses one result-table row; rows with no status hyperlink are
        silently ignored.
        """
        params = {}
        params['session'] = session
        params['chamber'] = chamber

        b = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
        if b: # Ignore if no match
            bill_status_url = b[0].attrib['href']
            params['bill_id'] = b[0].xpath('font')[0].text.split()[0]
            params['title'] = row.xpath('td/font/span[contains(@id, "_Label1")]/u/font')[0].text
            subject = row.xpath('td/font/span[contains(@id, "_Label6")]')[0].text
            subject = subject.replace('RELATING TO ', '') # Remove lead text
            params['subjects'] = [subject.replace('.', '')]
            params['description'] = row.xpath('td/font/span[contains(@id, "_Label2")]')[0].text
            sponsors = row.xpath('td/font/span[contains(@id, "_Label7")]')[0].text
            params['companion'] = row.xpath('td/font/span[contains(@id, "_Label8")]')[0].text
            bill = Bill(**params)
            # Every listed sponsor is recorded as primary.
            for sponsor in sponsors.split(', '):
                bill.add_sponsor('primary', sponsor)
            # Called for its side effects on ``bill``; the return value was
            # previously bound to an unused local (removed, along with the
            # unused ``bill_url`` lookup).
            self.scrape_actions(bill, bill_status_url)
            bill.add_source(bill_status_url)
            self.save_bill(bill)
        return
Esempio n. 6
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one WA bill from the GetLegislation service and save it."""
        # Biennium like "2011-12", derived from a session id like "2011-2012".
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        with self.urlopen(url) as page:
            page = lxml.etree.fromstring(page).xpath("//wa:Legislation",
                                                     namespaces=self._ns)[0]

            title = page.xpath("string(wa:LongDescription)",
                               namespaces=self._ns)

            bill_type = page.xpath(
                "string(wa:ShortLegislationType/wa:LongLegislationType)",
                namespaces=self._ns).lower()

            # Gubernatorial appointments are not bills; skip them entirely.
            if bill_type == 'gubernatorial appointment':
                return

            bill = Bill(session, chamber, bill_id, title,
                        type=[bill_type])

            chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
            # BUG FIX: the version URL hard-coded the "2011-12" biennium even
            # though the correct biennium is computed above from the session.
            version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                           "Htm/Bills/%s %ss/%s.htm" % (biennium,
                                                        chamber_name,
                                                        bill_type.title(),
                                                        bill_num))
            bill.add_version(bill_id, version_url)

            self.scrape_sponsors(bill)
            self.scrape_actions(bill)

            self.save_bill(bill)
Esempio n. 7
0
    def scrape_bill(self, session, chamber, bill_url):
        """Scrape a single Colorado bill page and save the Bill."""
        full_url = '{}{}'.format(CO_URL_BASE, bill_url)
        try:
            page = self.lxmlize(full_url)
        except scrapelib.HTTPError as e:
            # The site intermittently 503s; skip the bill rather than abort.
            if e.response.status_code == 503:
                self.error('Skipping %s w/ 503', bill_url)
                return
            raise

        number_xpath = ('//div[contains(@class,"field-name-field-bill-number")]'
                        '//div[contains(@class,"field-item even")][1]/text()')
        bill_number = page.xpath(number_xpath)[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])').strip()

        bill = Bill(session, chamber, bill_number, bill_title, summary=bill_summary)
        bill.add_source(full_url)

        # Each helper attaches its own category of data to the bill.
        for attach in (self.scrape_sponsors, self.scrape_actions,
                       self.scrape_versions, self.scrape_research_notes,
                       self.scrape_fiscal_notes, self.scrape_committee_report,
                       self.scrape_votes, self.scrape_amendments):
            attach(bill, page)

        self.save_bill(bill)
Esempio n. 8
0
    def parse_bill(self, chamber, session, special, link):
        """Parse one bill link from a listing page and save the Bill."""
        bill_num = link.text.strip()
        type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

        if type_abbr == 'B':
            btype = ['bill']
        elif type_abbr == 'R':
            btype = ['resolution']
        else:
            # BUG FIX: the regex alternation above ends with an empty branch,
            # so ``type_abbr`` can be '' and ``btype`` was left unbound,
            # crashing later with a NameError. Fail fast with a clear message.
            raise ValueError('unknown bill type abbreviation: %r' % type_abbr)

        bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

        url = info_url(chamber, session, special, type_abbr, bill_num)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath(
                "//td[text() = 'Short Title:']/following-sibling::td")[0]
            title = title.text.strip()

            bill = Bill(session, chamber, bill_id, title, type=btype)
            bill.add_source(url)

            self.parse_bill_versions(bill, page)

            self.parse_history(bill, history_url(chamber, session, special,
                                                 type_abbr, bill_num))

            self.parse_votes(bill, vote_url(chamber, session, special,
                                            type_abbr, bill_num))

            self.save_bill(bill)
Esempio n. 9
0
    def scrape_bill_info(self, chamber, session):
        """Load the CT bill_info CSV and register this chamber's bills."""
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        raw = self.urlopen(info_url)
        reader = csv.DictReader(StringIO.StringIO(raw))

        abbrev = {"upper": "S", "lower": "H"}[chamber]

        for row in reader:
            bill_id = row["bill_num"]
            # Only keep rows belonging to the requested chamber.
            if bill_id[0] != abbrev:
                continue

            # Classify by id prefix: SJ/HJ, SR/HR, else plain bill.
            for pattern, kind in ((r"^(S|H)J", "joint resolution"),
                                  (r"^(S|H)R", "resolution")):
                if re.match(pattern, bill_id):
                    bill_type = kind
                    break
            else:
                bill_type = "bill"

            bill = Bill(session, chamber, bill_id,
                        row["bill_title"].decode("latin-1"),
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor("introducer", introducer)

            bill["subjects"] = self._subjects[bill_id]

            self.bills[bill_id] = bill
Esempio n. 10
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(doc, '//h2[text()[contains(.,'
            '"Description")]]/following-sibling::p/text()')
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            # No inline description; follow the "Long Description" link.
            long_desc_url = self.get_node(doc, '//a[text()[contains(.,'
                '"Long Description")]]/@href')
            long_desc_page = self.lxmlize(long_desc_url)
            long_desc_text = self.get_node(long_desc_page, '//h1/'
                'following-sibling::p/text()')
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = 'No title found.'
                self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)
        # Second character of the id (F/R/C) encodes the bill type.
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        # Add subjects.  Currently we are not mapping to Open States
        # standardized subjects, so use 'scraped_subjects'
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Get companion bill.
        companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
        if companion is not None:
            # BUG FIX: only resolve the companion's chamber when a companion
            # exists; previously chamber_from_bill() was invoked with None
            # before the None-check. Also fixed the 2-space indentation.
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Esempio n. 11
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one WA bill from the GetLegislation web service.

        Builds the Bill, attaches a best-guess version document (falling
        back to the summary page's link when the guess 404s), then runs
        the sponsor/action/vote scrapers and returns the Bill.
        """
        # Biennium like "2011-12", derived from a session id like "2011-2012".
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        # Gubernatorial appointments are not bills; skip them entirely.
        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title,
                    type=[bill_type])

        # Human-readable summary page; also reused below as the fallback
        # source for the version document link.
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))
        bill.add_source(fake_source)

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        mimetype = 'text/html'
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" % (biennium,
                                                    chamber_name,
                                                    bill_type.title(),
                                                    bill_num))

        # Sometimes the measure's version_url isn't guessable. When that happens
        # have to get the url from the source page.
        try:
            version_resp = self.get(version_url)
            if version_resp.status_code != 200:
                # Use the *last* "billdocs" link on the summary page.
                webpage = self.get(fake_source).text
                webdoc = lxml.html.fromstring(webpage)
                version_url = webdoc.xpath('//a[contains(@href, "billdocs")]/@href')[-1]
                if version_url.lower().endswith('.pdf'):
                    mimetype = 'application/pdf'
        except scrapelib.HTTPError:
            # Best-effort: keep the guessed URL if the fallback fetch fails.
            pass

        bill.add_version(bill_id, version_url, mimetype=mimetype)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_votes(bill)
        self.fix_prefiled_action_dates(bill)

        return bill
Esempio n. 12
0
    def parse_senate_billpage(self, bill_url, year):
        """Scrape a Senate bill page (lxml version) and save the Bill."""
        bill_page = self.urlopen(bill_url)
        bill_page = lxml.html.fromstring(bill_page)
        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath(
            '//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath(
            '//*[@id="lblBriefDesc"]')[0].text_content()
        bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        # Map the three-character id prefix (e.g. "SJR") to a bill type.
        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self.subjects:
            subs = self.subjects[bid]
            self.log("With subjects for this bill")

        self.log(bid)

        bill = Bill(year, 'upper', bill_id, bill_desc,
                    bill_lr=bill_lr, type=bill_type, subjects=subs)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
        bill_sponsor = sponsor.text_content()
        bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        # FIX: dict.has_key() was removed in Python 3; use the ``in`` operator.
        cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
            self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

        # get the actions
        action_url = bill_page.xpath('//*[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url = action_url[0].attrib['href']
            self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
        if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
            self.parse_senate_bill_versions(
                bill, versions_url[0].attrib['href'])

        self.save_bill(bill)
Esempio n. 13
0
    def scrape(self, chamber, session):
        """Scrape Alabama bills for one chamber of a session."""
        self.log(self.metadata['session_details'])
        self.site_id = self.metadata['session_details'][session]['internal_id']
        chamber_piece = {'upper': 'Senate',
                         'lower': 'House+of+Representatives'}[chamber]

        # resolutions
        # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

        url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

        cookie = self.refresh_session()

        agent = FakeFirefoxURLopener()
        agent.addheader('Cookie', cookie)
        page = agent.open(url)
        doc = lxml.html.fromstring(page.read())

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house
            #   current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            self.log(bill_id)
            oid = btn_re.search(button.text_content()).groups()[0]

            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()

            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(),
                        topic=topic)
            # BUG FIX: arguments were reversed as (sponsor, 'primary');
            # add_sponsor takes (type, name) everywhere else in this codebase.
            bill.add_sponsor('primary', sponsor)

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URL
            # NOTE(review): session fragment is hard-coded to 2010rs, which
            # breaks for any other session — confirm and derive from ``session``.
            session_fragment = '2010rs'
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)

            self.save_bill(bill)
Esempio n. 14
0
    def scrape(self, session, chambers):
        """Scrape Quebec National Assembly bills for *session*."""
        urlified_session_id = session.replace(':', '-')
        url = ('http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/'
               'projets-loi-%s.html' % urlified_session_id)
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)
        # All actions for the session are fetched once up front, keyed by bill id.
        actions = self.scrape_actions(urlified_session_id)

        for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'):
            id_td, details_td = row.xpath('td')[:2]
            bill_id = clean_spaces(id_td.text_content())
            pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0]
            bill_name = clean_spaces(pdf_link.text_content())
            pdf_url = pdf_link.xpath('@href')[0]
            detail_url = ('http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/'
                          'projet-loi-%s-%s.html' % (bill_id, urlified_session_id))

            bill = Bill(session, 'lower', bill_id, bill_name)
            for source in (url, detail_url, pdf_url):
                bill.add_source(source)

            # add actions
            for action in actions[bill_id]:
                bill.add_action('lower', action['name'], action['date'])

            # sponsors live on the per-bill detail page
            self.scrape_details(bill, detail_url)
            self.save_bill(bill)
Esempio n. 15
0
    def scrape(self, chamber, session):
        """Scrape Wyoming bills for one chamber of a session."""
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billindex/"
               "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
        page = lxml.html.fromstring(self.urlopen(url))

        # Skip the header row of the cross-reference table.
        for tr in page.xpath("//tr[@valign='middle']")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            # Joint resolutions are prefixed SJ/HJ; everything else is a bill.
            bill_type = ('joint resolution' if bill_id[0:2] in ['SJ', 'HJ']
                         else 'bill')

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions live in columns 6, 9 and 10
            version_links = (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                             tr.xpath('td[10]//a'))
            for link in version_links:
                bill.add_version(link.text, link.get('href'))

            # documents: fiscal note in column 7, summary in column 12
            for label, cell_xpath in (('Fiscal Note', 'td[7]//a'),
                                      ('Summary', 'td[12]//a')):
                links = tr.xpath(cell_xpath)
                if links:
                    bill.add_document(label, links[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Esempio n. 16
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                # Rows without a date (headers/blanks) carry no action.
                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: this loop bound ``row`` but the body read ``a``, a
            # stale variable from the sponsors loop, so every version got
            # the last sponsor's text and link.
            for a in page.cssselect('#versions a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Esempio n. 17
0
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        """Scrape one Illinois bill status page and save the Bill.

        Sponsors are collected first but only added after actions are
        processed, since action text is used to refine the sponsor list.
        """
        try:
            doc = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        # FIX: raw string so ``\d`` is a regex class, not an invalid escape.
        bill_num = re.findall(r'DocNum=(\d+)', url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
        summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type,
                    summary=summary)

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
        # don't add just yet; we can make them better using action data

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(),
                                              "%m/%d/%Y")
            actor = actor.text_content()
            if actor == 'House':
                actor = 'lower'
            elif actor == 'Senate':
                actor = 'upper'

            action = action.text_content()
            bill.add_action(actor, action, date,
                            **_categorize_action(action))
            if action.lower().find('sponsor') != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        # BUG FIX: the loop variable was named ``chamber``, silently
        # clobbering the method's ``chamber`` parameter.
        for spontype, sponsor, spon_chamber, official_type in sponsor_list:
            if spon_chamber:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type,
                                 chamber=spon_chamber)
            else:
                bill.add_sponsor(spontype, sponsor,
                                 official_type=official_type)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)

        # if there's more than 1 votehistory link, there are votes to grab
        if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
            votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
            self.scrape_votes(session, bill, votes_url)

        self.save_bill(bill)
Esempio n. 18
0
    def _parse_bill(self, session, chamber, source_url, line):
        """Parse one delimited data line into a Bill and save it.

        Lines are '\xe4'-separated: type, combined id, number, title,
        relating-to clause. Only HB/SB lines matching *chamber* are kept.
        """
        if line:
            (type, combined_id, number, title, relating_to) = line.split("\xe4")
            if (type == 'HB' and chamber == 'lower') or (type == 'SB' and chamber == 'upper'):
                #
                # basic bill info
                bill_id = "%s %s" % (type, number.zfill(4))
                bill = Bill(session, chamber, bill_id, title)
                bill.add_source(source_url)

                #
                # add actions
                # FIX: dict.has_key() was removed in Python 3; use ``in``.
                if bill_id in self.actionsByBill:
                    for a in self.actionsByBill[bill_id]:
                        bill.add_action(a['actor'], a['action'], a['date'])

                if self.load_versions_sponsors:
                    # add versions and sponsors
                    versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(self, session, bill_id)
                    if versionsSponsors:
                        for ver in versionsSponsors['versions']:
                            bill.add_version(ver['name'], ver['url'])
                        # a lone sponsor is primary; several are cosponsors
                        sponsorType = 'primary'
                        if len(versionsSponsors['sponsors']) > 1:
                            sponsorType = 'cosponsor'
                        for name in versionsSponsors['sponsors']:
                            bill.add_sponsor(sponsorType, name)

                # save - writes out JSON
                self.save_bill(bill)
Esempio n. 19
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape a Puerto Rico bill status page and save the Bill."""
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()

            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            # The last table on the page lists actions, one row per action;
            # skip its header row.
            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                cells = row.xpath('td')

                # ignore row missing date
                if len(cells) != 2:
                    continue

                when = datetime.datetime.strptime(cells[0].text_content(),
                                                  "%m/%d/%Y")
                action = cells[1].text_content()
                bill.add_action(chamber, action, when)

                # also has an associated version
                if cells[1].xpath('a'):
                    bill.add_version(action, cells[1].xpath('a/@href')[0])

            bill.add_source(url)
            self.save_bill(bill)
Esempio n. 20
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # BUG FIX: the context result was bound to ``lxml`` (shadowing the
        # lxml module) while the body referenced an undefined ``page``;
        # bind the parsed document to ``page`` as the body expects.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # Rows with no date in either column are headers/blanks.
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Esempio n. 21
0
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill: name is the text after the first '-'
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # Fix: the loop iterated `row` but read `a`, the stale loop
            # variable left over from the sponsor loop above, so every
            # version was recorded with the last sponsor's text and href.
            for a in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(a.text_content(),
                                 urlparse.urljoin(url, a.get('href')))

            self.save_bill(bill)
Esempio n. 22
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # Fix: the parsed document was bound `as lxml`, leaving every
        # `page.cssselect(...)` below a NameError; bind it as `page`.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table")

            # Bill: name is the text after the first '-' in the header link
            name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives at the same path under /fulltext/
            bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

            # Sponsorships
            for a in tables[2].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions: col 0 = senate date, col 1 = action, col 2 = house date
            for row in tables[-1].cssselect("tr"):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # skip header/blank rows carrying no real date
                if "/" not in senate_date and "/" not in house_date:
                    continue
                if senate_date:
                    bill.add_action("upper", action_text, senate_date)
                if house_date:
                    bill.add_action("lower", action_text, house_date)

            self.save_bill(bill)
Esempio n. 23
0
    def scrape2001(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect("table center table")

            # Bill: name is the text after the first '-'
            name = tables[0].text_content().split("-", 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect("a"):
                bill.add_sponsor("", a.text_content().strip())

            # Actions
            center = page.cssselect("table center")[-1]

            for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if "/" not in date:
                    continue
                # drop the leading "Senate"/"House" word from the action text
                if action_text.startswith("Senate"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("upper", action_text, date)
                elif action_text.startswith("House"):
                    action_text = action_text.split(" ", 1)[1].strip()
                    bill.add_action("lower", action_text, date)

            # Versions
            # Fix: the loop iterated `row` but read `a`, the stale loop
            # variable left over from the sponsor loop above, so every
            # version was recorded with the last sponsor's text and href.
            for a in center.cssselect("table table")[1].cssselect("a"):
                bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href")))

            self.save_bill(bill)
Esempio n. 24
0
    def scrape_current(self, chamber, term):
        """Scrape current-term bills for one chamber from the KS bill_status feed.

        Fetches the full JSON feed once, keeps only bills whose number starts
        with this chamber's letter, and records title(s), sponsors and the
        (reversed, oldest-first) action history before saving each bill.
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                # 'CR' must be tested before 'R'/'B' since it contains both
                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # Fix: an id matching none of the above previously left
                    # `btype` unbound (UnboundLocalError); warn and default.
                    self.warning('unrecognized bill type for %s' % bill_id)
                    btype = 'bill'

                # main
                bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                            type=btype, status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                if bill_data['LONGTITLE']:
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; multiple sponsors are cosponsors
                for sponsor in bill_data['SPONSOR_NAMES']:
                    stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                             else 'cosponsor')
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper' if event['chamber'] == 'Senate'
                             else 'lower')

                    date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning('unknown action code on %s: %s %s' %
                                     (bill_id, event['action_code'],
                                      event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Esempio n. 25
0
 def scrape_bill_status_page(self, url, params={}):
     """Scrapes the status page url, populating parameter dict and
     returns bill.

     Fix: ``params={}`` is a shared mutable default, so values scraped
     for one bill leaked into later calls; a fresh copy is taken per
     call (callers may still pass their own dict, unchanged interface).
     """
     # copy so we never mutate the shared default (or the caller's dict)
     params = dict(params)
     with self.urlopen(url) as page:
         page = lxml.html.fromstring(page)
         page.make_links_absolute(url)
         # bill id is the first whitespace-separated token of the heading link
         params["bill_id"] = page.xpath('//h3[contains(@class, "center")]/a')[0].text.split()[0]
         params["title"] = page.xpath(
             '//div[div[contains( \
             ., "Report Title")]]/div[contains(@class, "rightside")]'
         )[0].text.strip()
         sponsors = page.xpath(
             '//div[div[contains( \
             ., "Introducer")]]/div[contains(@class, "rightside")]'
         )[0].text
         subject = page.xpath(
             '//div[div[contains( \
             ., "Measure Title")]]/div[contains(@class, "rightside")]'
         )[0].text.strip()
         subject = subject.replace("RELATING TO ", "")  # Remove lead text
         params["subject"] = subject.replace(".", "")
         params["description"] = page.xpath(
             '//div[div[contains( \
             ., "Description")]]/div[contains(@class, "rightside")]'
         )[0].text
         params["companion"] = page.xpath(
             '//div[div[contains( \
             ., "Companion")]]/div[contains(@class, "rightside")]'
         )[0].text
         # fall back to the subject when the report title is empty
         if params["title"] == "":
             params["title"] = params["subject"]
         actions = []
         table = page.xpath('//table[tr/th[contains(., "Date")]]')[0]
         for row in table.xpath("tr[td]"):  # Ignore table header row
             action_params = {}
             cells = row.xpath("td")
             # only rows with exactly date/chamber/action cells are actions
             if len(cells) == 3:
                 ch = cells[1].text
                 action_params["actor"] = house[ch]
                 action_params["action"] = cells[2].text
                 action_date = cells[0].text.split()[0]  # Just get date, ignore any time.
                 try:
                     action_params["date"] = datetime.strptime(action_date, "%m/%d/%y")
                 except ValueError:  # Try a YYYY format.
                     action_params["date"] = datetime.strptime(action_date, "%m/%d/%Y")
                 actions.append(action_params)
         bill = Bill(**params)
         bill.add_sponsor("primary", sponsors)
         for action_params in actions:
             bill.add_action(**action_params)
     self.save_bill(bill)
     return bill
Esempio n. 26
0
    def scrape_xml(self, chamber, session):
        """Scrape GA bills for one chamber from the session's BillSummary.xml.

        Skips bills belonging to the other chamber, then records title and
        description, sponsors (with member code), HTML versions, and the
        status history for each bill before saving it.
        """
        start_letter = "S" if chamber == "upper" else "H"
        sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"}
        version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

        summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
        xml = self.urlopen(summary_url)
        doc = lxml.etree.fromstring(xml)

        for bxml in doc.xpath("//Bill"):
            # renamed from `type` so the builtin is not shadowed
            btype = bxml.get("Type")

            # if this is from the other chamber skip it
            if not btype.startswith(start_letter):
                continue

            bill_id = btype + bxml.get("Num") + bxml.get("Suffix")
            if btype in ("HB", "SB"):
                btype = "bill"
            elif btype in ("HR", "SR"):
                btype = "resolution"
            else:
                raise ValueError("unknown type: %s" % btype)

            # use short_title as title and long as description
            title = bxml.xpath("Short_Title/text()")[0]
            description = bxml.xpath("Title/text()")[0]

            bill = Bill(session, chamber, bill_id, title, type=btype, description=description)
            bill.add_source(summary_url)

            for sponsor in bxml.xpath("Sponsor"):
                # sponsor text ends with a member code; split it off
                sponsor_name, code = sponsor.text.rsplit(" ", 1)
                sponsor_name = sponsor_name.replace(",", ", ")
                bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")], sponsor_name, _code=code)

            for version in bxml.xpath("Versions/Version"):
                # NOTE: it is possible to get PDF versions by using .get('Id')
                # ex. URL:  legis.ga.gov/Legislation/20112012/108025.pdf
                # for now we just get HTML
                description, file_id = version.xpath("*/text()")
                bill.add_version(description, version_url + file_id)

            for action in bxml.xpath("StatusHistory/Status"):
                date = datetime.datetime.strptime(action.get("StatusDate"), "%Y-%m-%dT%H:%M:%S")
                code = action.get("StatusCode")
                if code in ("EFF", "Signed Gov"):
                    actor = "executive"
                elif code[0] == "S":
                    actor = "upper"
                elif code[0] == "H":
                    actor = "lower"
                else:
                    # Fix: any other code previously left `actor` unbound
                    # (UnboundLocalError); warn and skip the action instead.
                    self.warning("unknown status code %s on %s" % (code, bill_id))
                    continue

                atype = self._action_codes[code]

                bill.add_action(actor, action.text, date, atype)

            self.save_bill(bill)
Esempio n. 27
0
    def scrape_bill(self, term, bill_url):
        """Scrape a single TN bill page.

        Determines the primary/secondary bill ids from the (possibly
        starred) sponsor links, then records title, primary sponsor,
        summary document, action history and, when linked, votes.
        """
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)

            first_link = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
            co_links = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')

            if co_links:
                second_link = co_links[0].text
                # the starred number is the primary bill id; the other
                # chamber's number becomes the secondary id
                if '*' in first_link:
                    starred, other = first_link, second_link
                else:
                    starred, other = second_link, first_link
                bill_id = starred.replace(' ', '')[1:]
                secondary_bill_id = other.replace(' ', '')
                primary_chamber = 'lower' if 'H' in bill_id else 'upper'
            else:
                secondary_bill_id = None
                bill_id = first_link.replace(' ', '')[1:]
                primary_chamber = 'lower' if 'H' in first_link else 'upper'

            title = page.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(term, primary_chamber, bill_id, title,
                        secondary_bill_id=secondary_bill_id)
            bill.add_source(bill_url)

            # Primary sponsor: the text after "by" in the sponsor label
            sponsor_label = page.xpath("//span[@id='lblBillSponsor']")[0]
            sponsor = sponsor_label.text_content().split("by")[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary_link = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions: skip the header row of the history table
            history = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
            for action_row in history.xpath("tr[position()>1]"):
                cells = action_row.xpath("td")
                taken = cells[0].text
                when = datetime.datetime.strptime(cells[1].text.strip(), '%m/%d/%Y')
                # NEED TO ADD SECONDARY ACTIONS
                bill.add_action(primary_chamber, taken, when)

            vote_links = page.xpath("//span[@id='lblBillVotes']/a")
            if vote_links:
                vote_href = vote_links[0].get('href')
                bill = self.scrape_votes(
                    bill, sponsor,
                    'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (vote_href,))

            self.save_bill(bill)
Esempio n. 28
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape a single PR bill page: title, authors and co-authors,
        actions, versions/documents and (for passed bills) vote records.

        Raises NoSuchBill when the page contains no title row.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.urlopen(url)
        # the site intermittently serves a classic-ASP error page; skip it
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        # authors come as one comma-separated string; each is a primary sponsor
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)
        co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # co_authors[0] is skipped -- presumably the label text; the
            # comma-separated names appear in co_authors[1]. TODO confirm.
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor', self.clean_name(co_author).strip());
        # last table on the page is the action history; skip its header row
        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            # NOTE(review): when the date cell is empty, `date` keeps the value
            # from the previous row (and is unbound if the first row is empty)
            # -- presumably intentional for continuation rows; confirm.
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has - let's *shrug* assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype,action = self.parse_action(chamber,bill,action,action_url,date)
            if atype == 'bill:passed' and action_url:
                vote_chamber  = None
                # for/else: the else branch runs only when no pattern matched
                for pattern, vote_chamber in _voteChambers:
                   if re.match(pattern,action):
                       break

                else:
                   self.warning('coudnt find voteChamber pattern')

                # only lower-chamber vote documents are parsed here
                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url[0], action,date,
                                             vote_chamber)
                    # scrape_votes returns a (vote, error-detail) pair
                    if not vote[0] == None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
Esempio n. 29
0
    def scrape(self, chamber, session):
        """Scrape all bills for a session from the bill-listing page.

        Walks the listing table row by row, extracting the bill id, title
        and sponsors, then follows the version, action and vote links.
        """
        year = year_from_session(session)
        url = bills_url(year)
        with self.urlopen(url) as bills_page_html:
            bills_page = lxml.html.fromstring(bills_page_html)
            table_rows = bills_page.cssselect('tr')
            # Eliminate empty rows
            table_rows = table_rows[0:len(table_rows):2]
            for row in table_rows:
                row_elements = row.cssselect('td')

                bill_document = row_elements[0]
                bill_document.make_links_absolute(BASE_URL)

                element, attribute, link, pos = bill_document.iterlinks().next()
                # Fix: rstrip('.pdf') strips any trailing '.', 'p', 'd', 'f'
                # characters -- not the '.pdf' suffix -- and could eat the
                # end of the bill id itself; remove the suffix explicitly.
                bill_id = element.text_content()
                if bill_id.endswith('.pdf'):
                    bill_id = bill_id[:-4]
                bill_document_link = link

                # title runs up to the first capital that starts the sponsors
                title_and_sponsors = row_elements[1]
                title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]', title_and_sponsors.text_content())
                sponsors_match = re.search('[a-z]([A-Z]+.+)', title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split('--')

                bill = Bill(session, chamber, bill_id, title)
                bill.add_version('current', bill_document_link)

                # a lone sponsor (second slot '(NONE)') is primary
                if separated_sponsors[1] == '(NONE)':
                    bill.add_sponsor('primary', separated_sponsors[0])

                else:
                    bill.add_sponsor('cosponsor', separated_sponsors[0])
                    bill.add_sponsor('cosponsor', separated_sponsors[1])

                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute(BASE_URL)
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                # actions/votes links wrap the real target in '?Open&target='
                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = BASE_URL + link.split('?Open&target=')[1]

                self.scrape_actions(frame_link, bill)

                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = BASE_URL + link.split('?Open&target=')[1]
                self.scrape_votes(frame_link, chamber, bill)
Esempio n. 30
0
    def scrape_bill_info(self, chamber, session):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        page = urllib2.urlopen(info_url)
        page = csv.DictReader(page)

        abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        for row in page:
            bill_id = row['bill_num']
            if not bill_id[0] == abbrev:
                continue

            bill = Bill(session, chamber, bill_id, row['bill_title'])
            bill.add_source(info_url)
            self.bills[bill_id] = bill