# Python 2 stdlib: urlparse/urllib2 are the pre-Python-3 module names.
import urlparse
import urllib2

def scrape_session(self, chamber, session):
    billdirs_path = '/bills/%s/billhistory/%s_bills/' % (
        session, chamber_name(chamber))
    billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

    with self.urlopen(billdirs_url) as bill_dirs:
        for dir_ in parse_ftp_listing(bill_dirs):
            # Each subdirectory groups the history files for a range of bills.
            bill_url = urlparse.urljoin(billdirs_url, dir_) + '/'
            with self.urlopen(bill_url) as bills:
                for history in parse_ftp_listing(bills):
                    self.scrape_bill(chamber, session,
                                     urlparse.urljoin(bill_url, history))
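# parse_ftp_listing is a project helper that isn't shown in this excerpt.
# A minimal sketch of the assumed behavior -- yielding the entry name from
# each line of a plain-text FTP directory listing (the exact listing format
# depends on the server, so treat this as an assumption):
def parse_ftp_listing(listing):
    # Assumes a Unix-style listing where the name is the last field.
    for line in listing.splitlines():
        parts = line.split()
        if parts:
            yield parts[-1]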
def scrape_bill(self, chamber, session, url):
    with self.urlopen(url) as data:
        bill = self.parse_bill_xml(chamber, session, data)
        bill.add_source(url)

        # Version text lives in a parallel tree: billhistory -> billtext/html.
        versions_url = url.replace("billhistory", "billtext/html")
        versions_url = "/".join(versions_url.split("/")[0:-1])

        bill_prefix = bill["bill_id"].split()[0]
        bill_num = int(bill["bill_id"].split()[1])
        # Version filenames zero-pad the bill number to five digits.
        long_bill_id = "%s%05d" % (bill_prefix, bill_num)

        try:
            with self.urlopen(versions_url) as versions_list:
                bill.add_source(versions_url)
                for version in parse_ftp_listing(versions_list):
                    if version.startswith(long_bill_id):
                        version_name = version.split(".")[0]
                        version_url = urlparse.urljoin(versions_url + "/",
                                                       version)
                        bill.add_version(version_name, version_url)
        except urllib2.URLError:
            # Sometimes the version text is missing entirely.
            pass

        self.save_bill(bill)
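# The padded identifier is what makes the startswith() filter above work:
# "%05d" zero-pads the numeric part of the bill id to five digits.
long_bill_id = "%s%05d" % ("HB", 1)
assert long_bill_id == "HB00001"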
def scrape(self, chamber, session):
    self.validate_session(session)

    # A bare two-character session means the regular session,
    # e.g. "81" -> "81R".
    if len(session) == 2:
        session = "%sR" % session

    for btype in ["bills", "concurrent_resolutions",
                  "joint_resolutions", "resolutions"]:
        billdirs_path = "/bills/%s/billhistory/%s_%s/" % (
            session, chamber_name(chamber), btype)
        billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

        with self.urlopen(billdirs_url) as bill_dirs:
            for dir_ in parse_ftp_listing(bill_dirs):
                bill_url = urlparse.urljoin(billdirs_url, dir_) + "/"
                with self.urlopen(bill_url) as bills:
                    for history in parse_ftp_listing(bills):
                        self.scrape_bill(chamber, session,
                                         urlparse.urljoin(bill_url, history))
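# chamber_name is the other assumed helper: it maps the scraper's chamber
# keyword to the directory name used on the FTP site. A plausible sketch
# (the mapping is an assumption inferred from the paths built above):
def chamber_name(chamber):
    return 'house' if chamber == 'lower' else 'senate'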
def scrape_bill(self, chamber, session, url):
    with self.urlopen(url) as data:
        bill = self.parse_bill_xml(chamber, session, data)
        bill.add_source(url)

        versions_url = url.replace('billhistory', 'billtext/html')
        # Version URLs inexplicably use (H|S)(J|C) instead of (H|S)(JR|CR).
        versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
        versions_url = '/'.join(versions_url.split('/')[0:-1])

        bill_prefix = bill['bill_id'].split()[0]
        bill_num = int(bill['bill_id'].split()[1])
        long_bill_id = "%s%05d" % (bill_prefix, bill_num)

        try:
            with self.urlopen(versions_url) as versions_list:
                bill.add_source(versions_url)
                for version in parse_ftp_listing(versions_list):
                    if version.startswith(long_bill_id):
                        version_name = version.split('.')[0]
                        version_url = urlparse.urljoin(versions_url + '/',
                                                       version)
                        bill.add_version(version_name, version_url)
        except urllib2.URLError:
            # Sometimes the text is missing
            pass

        self.save_bill(bill)
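# The effect of the replace() chain on an illustrative history URL
# (host and path here are hypothetical stand-ins for the real FTP layout):
url = ('ftp://ftp.example.gov/bills/81R/billhistory/'
       'house_joint_resolutions/HJR00001_HJR00099/HJR00042.xml')
versions_url = url.replace('billhistory', 'billtext/html')
versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
versions_url = '/'.join(versions_url.split('/')[0:-1])
# -> ftp://ftp.example.gov/bills/81R/billtext/html/
#        house_joint_resolutions/HJ00001_HJ00099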
def scrape_session(self, chamber, session):
    journal_root = urlparse.urljoin(self._ftp_root,
                                    "/journals/" + session + "/html/")

    if chamber == 'lower':
        journal_root = urlparse.urljoin(journal_root, "house/")
    else:
        journal_root = urlparse.urljoin(journal_root, "senate/")

    with self.urlopen(journal_root) as listing:
        for name in parse_ftp_listing(listing):
            # Journal filenames are prefixed with the session number;
            # "81" (the 81st Legislature) is hardcoded here.
            if not name.startswith('81'):
                continue
            url = urlparse.urljoin(journal_root, name)
            self.scrape_journal(url, chamber)
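# The trailing slashes in these joins matter: urlparse.urljoin replaces the
# last path component unless the base ends in '/'.
import urlparse
print urlparse.urljoin('ftp://example.com/journals/81R/html/', 'house/')
# -> ftp://example.com/journals/81R/html/house/
print urlparse.urljoin('ftp://example.com/journals/81R/html', 'house/')
# -> ftp://example.com/journals/81R/house/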
def scrape_bill(self, chamber, session, url):
    with self.urlopen(url) as data:
        bill = self.parse_bill_xml(chamber, session, data)
        bill.add_source(url)

        versions_url = url.replace('billhistory', 'billtext/html')
        versions_url = '/'.join(versions_url.split('/')[0:-1])

        bill_prefix = bill['bill_id'].split()[0]
        bill_num = int(bill['bill_id'].split()[1])
        long_bill_id = "%s%05d" % (bill_prefix, bill_num)

        # Note: unlike the variants above, a missing version listing is not
        # guarded here, so a urllib2.URLError will propagate.
        with self.urlopen(versions_url) as versions_list:
            bill.add_source(versions_url)
            for version in parse_ftp_listing(versions_list):
                if version.startswith(long_bill_id):
                    version_name = version.split('.')[0]
                    version_url = urlparse.urljoin(versions_url + '/',
                                                   version)
                    bill.add_version(version_name, version_url)

        self.save_bill(bill)
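# A hypothetical driver, assuming these methods live on a scraper class that
# provides _ftp_root, urlopen(), validate_session(), and save_bill() (class
# and session names here are illustrative, not confirmed by this excerpt):
scraper = TXBillScraper()
for chamber in ('upper', 'lower'):
    scraper.scrape(chamber, '81')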