def update_bill_version_list(only_congress):
    """Write a text-versions.json file for every bill listed in the cached BILLS sitemaps.

    Each year-by-year ``BILLS.xml`` sitemap under ``<cache_dir>/fdsys/sitemap/``
    is parsed, and every document URL in it is split into congress, bill type,
    bill number and version code. Once all sitemaps have been read, one JSON
    document per bill is written, mapping each version code to its ``url`` and
    ``lastmod`` values.

    Args:
        only_congress: If truthy, only the sitemap files for the years returned
            by ``utils.get_congress_years(only_congress)`` (and that exist on
            disk) are read, and only bills of that congress are kept. If
            falsy, every cached sitemap year is read.

    Raises:
        Exception: If a sitemap's root element is not a sitemap ``urlset``, if
            a URL does not look like a bill document URL, or if the bill type
            in a URL is not in ``utils.thomas_types``.
    """
    # Compiled once, outside the per-URL loop. The dots in the host name and
    # in ".html" are escaped so that they only match literal dots.
    bill_url_pattern = re.compile(
        r"http://www\.gpo\.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail\.html")

    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = bill_url_pattern.match(url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)  # bill_number stays a string
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            # NOTE(review): congress is an int here, so this assumes
            # only_congress is an int as well -- confirm against the caller.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            versions = (bill_versions
                        .setdefault(congress, {})
                        .setdefault(bill_type, {})
                        .setdefault(bill_number, {}))
            versions[version_code] = {
                "url": url,
                "lastmod": lastmod,
            }

    # Output the bill version info. We can't do this until the end because we
    # need to get the complete list of versions for a bill before we write the
    # file, and the versions may be split across multiple sitemap files.
    for congress, by_type in bill_versions.items():
        for bill_type, by_number in by_type.items():
            for bill_number, versions in by_number.items():
                utils.write(
                    json.dumps(versions, sort_keys=True, indent=2, default=utils.format_datetime),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json")
                )
def get_output_path(year, collection, package_name, granule_name, options):
    """Return the path under which a package's document files are stored.

    Args:
        year: Publication year; used in the path for non-BILLS collections.
        collection: Collection name. "BILLS" is stored alongside the other
            bill data; anything else goes under the fdsys data directory.
        package_name: Package name within the collection.
        granule_name: Optional granule name; when truthy it is appended as a
            final path component for non-BILLS collections.
        options: Task options; ``options.get("congress")`` is passed to
            ``get_bill_id_for_package`` as ``restrict_to_congress``.

    Returns:
        The output path, or None for a BILLS package when
        ``get_bill_id_for_package`` returns a falsy result.
    """
    # Where to store the document files?
    # The path will depend a bit on the collection.
    if collection == "BILLS":
        # Store with the other bill data.
        bill_and_ver = get_bill_id_for_package(
            package_name, with_version=False,
            restrict_to_congress=options.get("congress"))
        # Check the result before unpacking it: unpacking a falsy result such
        # as None straight into two names raises TypeError, so the early
        # return below would never be reached.
        if not bill_and_ver:
            return None  # congress number does not match options["congress"]
        bill_id, version_code = bill_and_ver
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)
    else:
        # Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
        path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
        if granule_name:
            path += "/" + granule_name
        return path
def get_output_path(year, collection, package_name, granule_name, options):
    """Work out where the document files for a package belong on disk.

    Non-BILLS collections are stored in
    ``<data_dir>/fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME]``. BILLS packages
    are stored with the other bill data, in a ``text-versions/<version_code>``
    location resolved by ``output_for_bill``.

    Returns None for a BILLS package when ``get_bill_id_for_package`` yields a
    falsy result (per the inline note: the congress number does not match
    ``options["congress"]``).
    """
    if collection != "BILLS":
        # Generic layout: fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
        base = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
        return (base + "/" + granule_name) if granule_name else base

    # Bill text is kept next to the rest of the bill's data.
    parsed = get_bill_id_for_package(
        package_name,
        with_version=False,
        restrict_to_congress=options.get("congress"))
    if not parsed:
        # congress number does not match options["congress"]
        return None

    bill_id, version_code = parsed
    version_dir = "text-versions/" + version_code
    return output_for_bill(bill_id, version_dir, is_data_dot=False)
def run(options):
    """Fetch amendments for one amendment, one bill, or a whole congress.

    The set of amendments to fetch is chosen from ``options``:

    * ``amendment_id`` set: fetch just that amendment.
    * ``bill_id`` set: fetch the bill first, then every amendment listed in
      the bill's saved JSON under ``"amendments"``.
    * otherwise: ask ``bill_ids_for`` for the amendments of ``congress``
      (default ``utils.current_congress()``). If that yields nothing, a
      message is logged and the run stops; ``limit`` truncates the list.

    If ``pages_only`` is truthy, nothing is fetched. Otherwise each id is
    handed to ``fetch_amendment`` via ``utils.process_set`` and the collected
    search state is passed to ``save_bill_search_state``.

    Args:
        options: dict of task options. Keys read here: 'amendment_id',
            'bill_id', 'congress', 'fast', 'limit', 'pages_only'.

    Returns:
        None in every case.
    """
    amendment_id = options.get('amendment_id', None)
    bill_id = options.get('bill_id', None)

    # Filled in by bill_ids_for() in the congress-wide case; stays empty otherwise.
    search_state = {}

    if amendment_id:
        # Only the congress part of the id is needed (for the log line below).
        _, _, congress = utils.split_bill_id(amendment_id)
        to_fetch = [amendment_id]

    elif bill_id:
        # first, crawl the bill
        _, _, congress = utils.split_bill_id(bill_id)
        bill_status = fetch_bill(bill_id, options)
        if bill_status['ok']:
            bill = json.loads(utils.read(output_for_bill(bill_id, "json")))
            to_fetch = [x["amendment_id"] for x in bill["amendments"]]
        else:
            logging.error("Couldn't download information for that bill.")
            return None

    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = bill_ids_for(congress, utils.merge(options, {'amendments': True}), bill_states=search_state)

        if not to_fetch:
            if options.get("fast", False):
                # logging.warn is a deprecated alias; use logging.warning.
                logging.warning("No amendments changed.")
            else:
                logging.error("Error figuring out which amendments to download, aborting.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    if options.get('pages_only', False):
        return None

    logging.warning("Going to fetch %i amendments from congress #%s" % (len(to_fetch), congress))
    saved_amendments = utils.process_set(to_fetch, fetch_amendment, options)

    # keep record of the last state of all these amendments, for later fast-searching
    save_bill_search_state(saved_amendments, search_state)
def run(options):
    """Download amendment data as selected by ``options``.

    Selection, in priority order:

    1. ``options['amendment_id']``: that single amendment.
    2. ``options['bill_id']``: the bill is fetched with ``fetch_bill`` and the
       ids under ``"amendments"`` in its saved JSON are used. A failed bill
       download is logged as an error and ends the run.
    3. Neither: ``bill_ids_for`` is queried for ``options['congress']``
       (default ``utils.current_congress()``). An empty result is logged (as
       a warning when ``fast`` is set, as an error otherwise) and ends the
       run; a truthy ``limit`` keeps only the first ``int(limit)`` ids.

    When ``pages_only`` is truthy the run stops before fetching anything.
    Otherwise the ids go through ``utils.process_set`` with
    ``fetch_amendment``, and the result is recorded with
    ``save_bill_search_state``.

    Args:
        options: dict of task options (keys listed above).

    Returns:
        None in every case.
    """
    amendment_id = options.get('amendment_id', None)
    bill_id = options.get('bill_id', None)

    # Populated by bill_ids_for() when searching a whole congress.
    search_state = {}

    if amendment_id:
        # The type and number parts of the id are not used here.
        _type, _number, congress = utils.split_bill_id(amendment_id)
        to_fetch = [amendment_id]

    elif bill_id:
        # first, crawl the bill
        _type, _number, congress = utils.split_bill_id(bill_id)
        bill_status = fetch_bill(bill_id, options)
        if not bill_status['ok']:
            logging.error("Couldn't download information for that bill.")
            return None

        bill = json.loads(utils.read(output_for_bill(bill_id, "json")))
        to_fetch = [x["amendment_id"] for x in bill["amendments"]]

    else:
        congress = options.get('congress', utils.current_congress())
        search_options = utils.merge(options, {'amendments': True})
        to_fetch = bill_ids_for(congress, search_options, bill_states=search_state)

        if not to_fetch:
            if options.get("fast", False):
                # logging.warning replaces the deprecated logging.warn alias.
                logging.warning("No amendments changed.")
            else:
                logging.error("Error figuring out which amendments to download, aborting.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    if options.get('pages_only', False):
        return None

    # Arguments are passed separately; logging formats them into the same message.
    logging.warning("Going to fetch %i amendments from congress #%s", len(to_fetch), congress)
    saved_amendments = utils.process_set(to_fetch, fetch_amendment, options)

    # keep record of the last state of all these amendments, for later fast-searching
    save_bill_search_state(saved_amendments, search_state)