def bill_version_ids_for(only_congress, options):
    years = utils.get_congress_years(only_congress)
    only_bill_id = options.get('bill_id', None)

    version_ids = []

    for year in years:
        # don't bother fetching future years
        if year > datetime.datetime.now().year:
            continue

        # ensure BILLS sitemap for this year is present
        entries = fdsys.entries_from_collection(year, "BILLS", None, options)

        # some future years may not be ready yet
        if not entries:
            continue

        for entry in entries:
            url, lastmod = entry
            congress, bill_id, bill_version_id = split_url(url)

            # a year may have other congresses in it
            if int(congress) != int(only_congress):
                continue

            # we may be focused on a single bill id
            if only_bill_id and (bill_id != only_bill_id):
                continue

            version_ids.append(bill_version_id)

    return version_ids

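# `split_url` is called above but is not defined in this section. A minimal
# sketch, assuming the BILLS package URL format used elsewhere in this module
# and the project's usual "<type><number>-<congress>" bill_id convention;
# both the regex and the id formats are assumptions, not confirmed here.
def split_url(url):
    # e.g. http://www.gpo.gov/fdsys/pkg/BILLS-113hr1234ih/content-detail.html
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
    if not m:
        raise Exception("Unmatched bill document URL: " + url)
    congress, bill_type, bill_number, version_code = m.groups()
    bill_id = "%s%s-%s" % (bill_type, bill_number, congress)  # assumed convention
    bill_version_id = "%s-%s" % (bill_id, version_code)  # assumed convention
    return congress, bill_id, bill_version_id
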
def update_bill_version_list(only_congress):
    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            bill_versions.setdefault(congress, {}).setdefault(bill_type, {}).setdefault(bill_number, {})[version_code] = {
                "url": url,
                "lastmod": lastmod,
            }

    # Output the bill version info. We can't do this until the end because we need to get
    # the complete list of versions for a bill before we write the file, and the versions
    # may be split across multiple sitemap files.
    for congress in bill_versions:
        for bill_type in bill_versions[congress]:
            for bill_number in bill_versions[congress][bill_type]:
                utils.write(
                    json.dumps(
                        bill_versions[congress][bill_type][bill_number],
                        sort_keys=True,
                        indent=2,
                        default=utils.format_datetime,
                    ),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json"),
                )

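# The text-versions.json written above maps version codes to URL/lastmod pairs,
# for example:
#
#   {
#     "ih": {"lastmod": "2013-01-15T04:31:29Z",
#            "url": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr1234ih/content-detail.html"}
#   }
#
# `output_for_bill` itself is not defined in this section. A minimal sketch,
# assuming the project's usual data-directory layout
# (data/<congress>/bills/<type>/<type><number>/<file>); the exact path scheme
# is an assumption.
def output_for_bill(congress, bill_type, bill_number, filename):
    return "%s/%s/bills/%s/%s%s/%s" % (
        utils.data_dir(), congress, bill_type, bill_type, bill_number, filename)
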
def update_sitemap_cache(fetch_collections, options):
    """Updates a local cache of the complete FDSys sitemap tree.
    Pass fetch_collections as None, or as a set of collection names to
    restrict the update to particular FDSys collections. Only downloads
    changed sitemap files."""

    seen_collections = dict()  # maps collection name to a set() of sitemap years in which the collection is present

    # Load the root sitemap.
    master_sitemap = get_sitemap(None, None, None, options)
    if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
        raise Exception("Mismatched sitemap type at the root sitemap.")

    # Process the year-by-year sitemaps.
    for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
        # Get year and lastmod date.
        url = str(year_node.xpath("string(x:loc)", namespaces=ns))
        lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
        m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml", url)
        if not m or m.group(1) != m.group(2):
            raise ValueError("Unmatched sitemap URL: %s" % url)
        year = m.group(1)

        # Should we process this year's sitemaps?
        if options.get("congress", None) and int(year) not in utils.get_congress_years(int(options.get("congress"))):
            continue
        if options.get("year", None) and int(year) != int(options.get("year")):
            continue

        # Get the sitemap.
        year_sitemap = get_sitemap(year, None, lastmod, options)
        if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
            raise Exception("Mismatched sitemap type in %s sitemap." % year)

        # Process the collection sitemaps.
        for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
            # Get collection and lastmod date.
            url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(collection_node.xpath("string(x:lastmod)", namespaces=ns))
            m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml", url)
            if not m or m.group(1) != year or m.group(2) != year:
                raise ValueError("Unmatched sitemap URL: %s" % url)
            collection = m.group(3)

            # To help the user find a collection name, record this collection but don't download it.
            if options.get("list-collections", False):
                seen_collections.setdefault(collection, set()).add(int(year))
                continue

            # Should we download the sitemap?
            if fetch_collections and collection not in fetch_collections:
                continue

            # Get the sitemap.
            collection_sitemap = get_sitemap(year, collection, lastmod, options)
            if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
                raise Exception("Mismatched sitemap type in %s_%s sitemap." % (year, collection))

    if options.get("list-collections", False):
        max_collection_name_len = max(len(n) for n in seen_collections)

        def make_nice_year_range(years):
            ranges = []
            for y in sorted(years):
                if len(ranges) > 0 and ranges[-1][1] == y - 1:
                    # extend the previous range
                    ranges[-1][1] = y
                else:
                    # append a new range
                    ranges.append([y, y])
            ranges = [(("%d" % r[0]) if r[0] == r[1] else "%d-%d" % tuple(r)) for r in ranges]
            return ", ".join(ranges)

        for collection in sorted(seen_collections):
            print collection.ljust(max_collection_name_len), " ", make_nice_year_range(seen_collections[collection])

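# `ns` and `get_sitemap` are used throughout this module but not defined in
# this section. The namespace mapping follows directly from the tag checks
# above. The get_sitemap sketch below is an assumption: the URL patterns come
# from the regexes above, the cache layout matches the
# ".../fdsys/sitemap/<year>/<collection>.xml" globs and "...-lastmod.txt"
# files referenced elsewhere in this module, and the root sitemap URL is a
# guess based on the year-sitemap URL pattern.
ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}

def get_sitemap(year, collection, lastmod, options):
    # Choose the URL and cache path for the requested sitemap level.
    if year is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"  # assumed root URL
        path = "fdsys/sitemap/sitemap.xml"
    elif collection is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year  # assumed cache location
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Re-download only if the parent sitemap's lastmod differs from the one
    # we recorded on the previous run, then record the new lastmod.
    lastmod_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    force = (lastmod is None) or (utils.read(lastmod_file) != lastmod)
    body = utils.download(url, path, utils.merge(options, {"binary": True, "force": force}))
    if not body:
        raise Exception("Failed to download %s" % url)
    if lastmod:
        utils.write(lastmod, lastmod_file)
    return etree.fromstring(body)
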
def mirror_packages(fetch_collections, options):
    """Create a local mirror of FDSys document files. Only downloads changed
    files, according to the sitemap. Run update_sitemap_cache first.

    Pass fetch_collections as None, or as a set of collection names to
    restrict the update to particular FDSys collections.

    Set options["store"] to a comma-separated list of file types (pdf, mods,
    text, xml, zip).
    """

    # For determining whether we need to process a sitemap file again on a later
    # run, we need to make a key out of the command line arguments that affect
    # which files we are downloading.
    cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))

    file_types = options["store"].split(",")

    # Process each FDSys sitemap...
    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        # Has this sitemap changed since the last successful mirror?
        #
        # The sitemap's last modification time is stored in ...-lastmod.txt,
        # which comes from the sitemap's parent sitemap's lastmod listing for
        # the file.
        #
        # Compare that to the lastmod value of when we last did a successful mirror.
        # This function can be run to fetch different sets of files, so get the
        # lastmod value corresponding to the current run arguments.
        sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
        sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
        if os.path.exists(sitemap_store_state_file):
            sitemap_store_state = json.load(open(sitemap_store_state_file))
            if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
                # sitemap hasn't changed since the last time
                continue

        logging.info("scanning " + sitemap + "...")

        # Load the sitemap for this year & collection, and loop through each document.
        for package_name, lastmod in get_sitemap_entries(sitemap):
            # Add this package to the download list.
            file_list = []

            if not options.get("granules", False):
                # Doing top-level package files (granule==None).
                file_list.append(None)
            else:
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # In the STATUTE collection, the MODS information in granules is redundant with
                # information in the top-level package MODS file. But the only way to get granule-
                # level PDFs is to go through the granules.
                content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
                content_index = utils.download(
                    content_detail_url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(options, {'binary': True}))
                if not content_index:
                    raise Exception("Failed to download %s" % content_detail_url)
                for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append(granule_name)

            # Download the files of the desired types.
            for granule_name in file_list:
                mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options)

        # If we got this far, we successfully downloaded all of the files in this year/collection.
        # To speed up future updates, save the lastmod time of this sitemap in a file indicating
        # what we downloaded. The store-state file contains a JSON mapping of command line options
        # to the most recent lastmod value for this sitemap.
        sitemap_store_state = {}
        if os.path.exists(sitemap_store_state_file):
            sitemap_store_state = json.load(open(sitemap_store_state_file))
        sitemap_store_state[cache_options_key] = sitemap_last_mod
        json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))

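# `get_sitemap_entries` and `mirror_package` are called above but not defined
# in this section. A minimal sketch of get_sitemap_entries, assuming the same
# urlset parsing and package URL pattern used in mirror_files below;
# mirror_package presumably applies the per-file download logic shown there
# to a single package or granule.
def get_sitemap_entries(sitemap_filename):
    # Parse the cached sitemap and yield (package_name, lastmod) pairs.
    dom = etree.parse(sitemap_filename).getroot()
    if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
        raise Exception("Mismatched sitemap type.")
    for file_node in dom.xpath("x:url", namespaces=ns):
        url = str(file_node.xpath("string(x:loc)", namespaces=ns))
        lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
        m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
        if not m:
            raise Exception("Unmatched document URL: " + url)
        yield m.group(1), lastmod
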
def mirror_files(fetch_collections, options):
    # Locally mirror certain file types for the specified collections.

    file_types = options["store"].split(",")

    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        logging.warn(sitemap + "...")

        # Load the sitemap for this year & collection.
        dom = etree.parse(sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each document in the collection in this year...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # Get URL and last modified timestamp.
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
            if not url.endswith("/content-detail.html"):
                raise Exception("Unrecognized file pattern.")

            # Get the package name.
            m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched document URL")
            package_name = m.group(1)

            # Where to store the document files?
            # The path will depend a bit on the collection.
            if collection == "BILLS":
                # Store with the other bill data.
                m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched bill document URL: " + url)
                congress, bill_type, bill_number, version_code = m.groups()
                congress = int(congress)
                if "congress" in options and congress != int(options["congress"]):
                    continue
                path = output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
            else:
                # Store in fdsys/COLLECTION/YEAR/PKGNAME.
                path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)

            # Do we need to update this record?
            lastmod_cache_file = path + "/lastmod.txt"
            cache_lastmod = utils.read(lastmod_cache_file)
            force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

            # Add this package to the download list.
            file_list = []
            file_list.append((None, path))

            if options.get("granules", False):
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # Josh originally thought the STATUTE granule files (individual statutes) were
                # useful, but then it turned out the information is redundant with information
                # in the top-level package MODS file.
                content_index = utils.download(
                    url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(options, {
                        "xml": True,  # it's not XML, but this avoids unescaping HTML, which fails if there are unicode characters
                        "force": force,
                    }))
                if not content_index:
                    raise Exception("Failed to download %s" % url)
                for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append((granule_name, path + "/" + granule_name))

            # Download the files of the desired types.
            for granule_name, file_path in file_list:
                targets = get_package_files(package_name, granule_name, file_path)
                for file_type in file_types:
                    if file_type not in targets:
                        raise Exception("Invalid file type: %s" % file_type)
                    f_url, f_path = targets[file_type]
                    if force:
                        logging.warn(f_path)
                    data = utils.download(f_url, f_path, utils.merge(options, {"xml": True, "force": force, "to_cache": False}))
                    if not data:
                        raise Exception("Failed to download %s" % f_url)

            # Write the current last modified date to disk so we know the next time whether
            # we need to fetch the file.
            if lastmod and not options.get("cached", False):
                utils.write(lastmod, lastmod_cache_file)

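# `get_package_files` is used above but not defined in this section. A minimal
# sketch mapping each file type named in mirror_packages (pdf, mods, text,
# xml, zip) to a (remote URL, local path) pair. The URL layouts, the granule
# URL scheme, and the local filenames are assumptions about the FDSys package
# structure, not confirmed by this section.
def get_package_files(package_name, granule_name, path):
    base_url = "http://www.gpo.gov/fdsys/pkg/" + package_name
    doc_name = package_name if not granule_name else granule_name
    if granule_name:
        base_url += "/granule/" + granule_name  # assumed granule URL layout
    return {
        "mods": (base_url + "/mods.xml", path + "/mods.xml"),
        "pdf": (base_url + "/pdf/" + doc_name + ".pdf", path + "/document.pdf"),
        "xml": (base_url + "/xml/" + doc_name + ".xml", path + "/document.xml"),
        "text": (base_url + "/html/" + doc_name + ".htm", path + "/document.html"),
        "zip": (base_url + "/zip/" + doc_name + ".zip", path + "/document.zip"),
    }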