Example 1
    def scrape_session(self, chamber, session):
        billdirs_path = '/bills/%s/billhistory/%s_bills/' % (
            session, chamber_name(chamber))
        billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

        with self.urlopen(billdirs_url) as bill_dirs:
            # Each listing entry is a per-bill directory; visit each one
            # and scrape every history file inside it.
            for bill_dir in parse_ftp_listing(bill_dirs):
                bill_url = urlparse.urljoin(billdirs_url, bill_dir) + '/'
                with self.urlopen(bill_url) as bills:
                    for history in parse_ftp_listing(bills):
                        self.scrape_bill(chamber, session,
                                         urlparse.urljoin(bill_url, history))
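Every example iterates over parse_ftp_listing, which is not shown on this page. A minimal sketch, assuming a Unix-style LIST response where the file name is the last whitespace-separated field (the real helper's parsing will depend on the server's actual listing format):

    def parse_ftp_listing(text):
        # Hypothetical reconstruction: yield the name field of each
        # non-empty line of a raw FTP LIST response.
        for line in text.splitlines():
            if line.strip():
                yield line.split()[-1]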
Example 2
    def scrape_bill(self, chamber, session, url):
        with self.urlopen(url) as data:
            bill = self.parse_bill_xml(chamber, session, data)
            bill.add_source(url)

            versions_url = url.replace("billhistory", "billtext/html")
            versions_url = "/".join(versions_url.split("/")[0:-1])

            bill_prefix = bill["bill_id"].split()[0]
            bill_num = int(bill["bill_id"].split()[1])
            # Version files carry the bill number zero-padded to five
            # digits, e.g. "HB 71" becomes "HB00071".
            long_bill_id = "%s%05d" % (bill_prefix, bill_num)

            try:
                with self.urlopen(versions_url) as versions_list:
                    bill.add_source(versions_url)
                    for version in parse_ftp_listing(versions_list):
                        if version.startswith(long_bill_id):
                            version_name = version.split(".")[0]
                            version_url = urlparse.urljoin(versions_url + "/", version)
                            bill.add_version(version_name, version_url)
            except urllib2.URLError:
                # Sometimes the text is missing
                pass

            self.save_bill(bill)
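These methods also assume a self.urlopen helper that acts as a context manager yielding the fetched body. A minimal stand-alone stand-in for Python 2 (the scraper's real version presumably layers in caching, throttling, or retries):

    import contextlib
    import urllib2

    @contextlib.contextmanager
    def urlopen(url):
        # Hypothetical stand-in for self.urlopen: fetch the URL (urllib2
        # handles ftp:// as well as http://) and yield its body.
        resp = urllib2.urlopen(url)
        try:
            yield resp.read()
        finally:
            resp.close()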
Example 3
    def scrape(self, chamber, session):
        self.validate_session(session)

        # The FTP site names sessions with a trailing letter (e.g. "81R"
        # for a regular session); append "R" to a bare two-character
        # session identifier.
        if len(session) == 2:
            session = "%sR" % session

        for btype in ["bills", "concurrent_resolutions", "joint_resolutions", "resolutions"]:
            billdirs_path = "/bills/%s/billhistory/%s_%s/" % (session, chamber_name(chamber), btype)
            billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

            with self.urlopen(billdirs_url) as bill_dirs:
                # As in Example 1, each listing entry is a per-bill
                # directory to descend into.
                for bill_dir in parse_ftp_listing(bill_dirs):
                    bill_url = urlparse.urljoin(billdirs_url, bill_dir) + "/"
                    with self.urlopen(bill_url) as bills:
                        for history in parse_ftp_listing(bills):
                            self.scrape_bill(chamber, session, urlparse.urljoin(bill_url, history))
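Examples 1 and 3 also rely on a chamber_name helper to build the FTP path. Judging from the house/senate split in Example 5, it presumably maps the scraper's chamber keywords to directory names; a one-line sketch under that assumption:

    def chamber_name(chamber):
        # Hypothetical reconstruction: 'lower' -> 'house', otherwise 'senate'.
        return 'house' if chamber == 'lower' else 'senate'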
Example 4
    def scrape_bill(self, chamber, session, url):
        with self.urlopen(url) as data:
            bill = self.parse_bill_xml(chamber, session, data)
            bill.add_source(url)

            versions_url = url.replace('billhistory', 'billtext/html')
            # Version URLs inexplicably use (H|S)(J|C) instead of (H|S)(JR|CR)
            versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
            versions_url = '/'.join(versions_url.split('/')[0:-1])

            bill_prefix = bill['bill_id'].split()[0]
            bill_num = int(bill['bill_id'].split()[1])
            long_bill_id = "%s%05d" % (bill_prefix, bill_num)

            try:
                with self.urlopen(versions_url) as versions_list:
                    bill.add_source(versions_url)
                    for version in parse_ftp_listing(versions_list):
                        if version.startswith(long_bill_id):
                            version_name = version.split('.')[0]
                            version_url = urlparse.urljoin(versions_url + '/',
                                                           version)
                            bill.add_version(version_name, version_url)
            except urllib2.URLError:
                # Sometimes the text is missing
                pass

            self.save_bill(bill)
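Example 4 differs from Example 2 only in that extra rewriting of the versions URL. The two replace calls just shorten the resolution suffix, which a quick interactive check confirms:

    >>> 'HJR'.replace('JR', 'J').replace('CR', 'C')
    'HJ'
    >>> 'SCR'.replace('JR', 'J').replace('CR', 'C')
    'SC'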
Example 5
    def scrape_session(self, chamber, session):
        journal_root = urlparse.urljoin(self._ftp_root,
                                        "/journals/" + session + "/html/")

        if chamber == 'lower':
            journal_root = urlparse.urljoin(journal_root, "house/")
        else:
            journal_root = urlparse.urljoin(journal_root, "senate/")

        with self.urlopen(journal_root) as listing:
            for name in parse_ftp_listing(listing):
                # Only files for the 81st session (names beginning '81')
                # are scraped; note the session number is hard-coded here.
                if not name.startswith('81'):
                    continue
                url = urlparse.urljoin(journal_root, name)
                self.scrape_journal(url, chamber)
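Note that urlparse.urljoin discards the base URL's path whenever the second argument starts with a slash, which is why joining the root-relative /journals/... path onto self._ftp_root works. Illustrated with a placeholder host:

    >>> import urlparse
    >>> urlparse.urljoin('ftp://example.com/bills/', '/journals/81R/html/')
    'ftp://example.com/journals/81R/html/'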
Example 6
    def scrape_bill(self, chamber, session, url):
        with self.urlopen(url) as data:
            bill = self.parse_bill_xml(chamber, session, data)
            bill.add_source(url)

            versions_url = url.replace('billhistory', 'billtext/html')
            versions_url = '/'.join(versions_url.split('/')[0:-1])

            bill_prefix = bill['bill_id'].split()[0]
            bill_num = int(bill['bill_id'].split()[1])
            long_bill_id = "%s%05d" % (bill_prefix, bill_num)

            with self.urlopen(versions_url) as versions_list:
                bill.add_source(versions_url)
                for version in parse_ftp_listing(versions_list):
                    if version.startswith(long_bill_id):
                        version_name = version.split('.')[0]
                        version_url = urlparse.urljoin(versions_url + '/',
                                                       version)
                        bill.add_version(version_name, version_url)

            self.save_bill(bill)
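Example 6 is the same routine as Example 2 minus the URLError guard, so a missing versions listing will raise rather than be skipped. All six snippets are Python 2; if you port them, the assumed stdlib names move as follows:

    # Python 3 equivalents of the modules these snippets depend on:
    from urllib.parse import urljoin   # was urlparse.urljoin
    from urllib.error import URLError  # was urllib2.URLError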