def update_bill_version_list(only_congress):
    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            bill_versions \
                .setdefault(congress, {}) \
                .setdefault(bill_type, {}) \
                .setdefault(bill_number, {}) \
                [version_code] = {
                    "url": url,
                    "lastmod": lastmod,
                }

    # Output the bill version info. We can't do this until the end because we need to get
    # the complete list of versions for a bill before we write the file, and the versions
    # may be split across multiple sitemap files.
    for congress in bill_versions:
        for bill_type in bill_versions[congress]:
            for bill_number in bill_versions[congress][bill_type]:
                utils.write(
                    json.dumps(bill_versions[congress][bill_type][bill_number],
                               sort_keys=True, indent=2, default=utils.format_datetime),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json"))

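# A minimal, self-contained sketch of the URL-parsing step above; the package URL is made up
# but follows the FDSys "content-detail.html" pattern the regular expression expects.
import re

url = "http://www.gpo.gov/fdsys/pkg/BILLS-113hr1234ih/content-detail.html"
m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
if m:
    congress, bill_type, bill_number, version_code = m.groups()
    print(congress, bill_type, bill_number, version_code)  # 113 hr 1234 ih
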
def update_sitemap(url, current_lastmod, how_we_got_here, options):
    """Updates the local cache of a sitemap file."""

    # Skip if the year or congress flags are set and this sitemap is
    # not for that year or Congress.
    if should_skip_sitemap(url, options):
        return []

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Get the file paths to cache:
    # * the sitemap XML for future runs
    # * its <lastmod> date (which comes from the parent sitemap) so we know if we need to re-download it now
    # * the <lastmod> dates of the packages listed in this sitemap so we know if we need to re-download any package files
    cache_file = get_sitemap_cache_file(url)
    cache_file = os.path.join("govinfo/sitemap", cache_file, "sitemap.xml")
    lastmod_cache_file = cache_file.replace(".xml", "-lastmod.yaml")
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)
    if not os.path.exists(lastmod_cache_file):
        lastmod_cache = {}
    else:
        with open(lastmod_cache_file) as f:
            lastmod_cache = rtyaml.load(f)

    try:
        return update_sitemap2(url, current_lastmod, how_we_got_here, options, lastmod_cache, cache_file)
    finally:
        # Write the updated last modified dates to disk so we know the next time whether
        # we need to fetch the files. If we didn't download anything, no need to write an
        # empty file.
        with utils.NoInterrupt():
            with open(lastmod_cache_file, "w") as f:
                rtyaml.dump(lastmod_cache, f)

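# A minimal sketch of the lastmod-cache round trip used above, with a made-up temp path and
# contents; it reuses the same rtyaml.load()/rtyaml.dump() calls the function relies on.
import os
import tempfile

import rtyaml

lastmod_cache_file = os.path.join(tempfile.gettempdir(), "sitemap-lastmod.yaml")
lastmod_cache = {"BILLS-113hr1234ih": "2014-01-01T00:00:00Z"}

with open(lastmod_cache_file, "w") as f:
    rtyaml.dump(lastmod_cache, f)

with open(lastmod_cache_file) as f:
    print(rtyaml.load(f))  # prints the cached mapping back
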
def get_sitemap(year, collection, lastmod, options):
    """Gets a single sitemap, downloading it if the sitemap has changed.

    Downloads the root sitemap (year==None, collection==None), or
    the sitemap for a year (collection==None), or the sitemap for
    a particular year and collection. Pass lastmod which is the current
    modification time of the file according to its parent sitemap, which
    is how it knows to return a cached copy.

    Returns the sitemap parsed into a DOM.
    """

    # Construct the URL and the path to where to cache the file on disk.
    if year is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
        path = "fdsys/sitemap/sitemap.xml"
    elif collection is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Should we re-download the file?
    lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    if options.get("cached", False):
        # If --cached is used, don't hit the network.
        force = False
    elif not lastmod:
        # No *current* lastmod date is known for this file (because it is the master
        # sitemap file, probably), so always download.
        force = True
    else:
        # If the file is out of date or --force is used, download the file.
        cache_lastmod = utils.read(lastmod_cache_file)
        force = (lastmod != cache_lastmod) or options.get("force", False)

    if force:
        logging.warn("Downloading: %s" % url)

    body = utils.download(url, path, utils.merge(options, {
        'force': force,
        'binary': True,
    }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)

    try:
        return etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

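# A self-contained sketch of how the DOM returned by get_sitemap() is queried downstream
# (e.g. in update_bill_version_list above); "ns" mirrors the sitemap namespace map the
# surrounding code uses, and the sample XML is made up.
from lxml import etree

ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}
sample = b"""<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://www.gpo.gov/fdsys/pkg/BILLS-113hr1234ih/content-detail.html</loc>
    <lastmod>2014-01-01T00:00:00Z</lastmod>
  </url>
</urlset>"""

dom = etree.fromstring(sample)
for node in dom.xpath("x:url", namespaces=ns):
    print(node.xpath("string(x:loc)", namespaces=ns),
          node.xpath("string(x:lastmod)", namespaces=ns))
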
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)

def run(options):
    # Download the TSV file.
    cache_zip_path = "adler-wilkerson-bills.zip"
    utils.download(
        "http://congressionalbills.org/billfiles/bills80-92.zip",
        cache_zip_path,
        utils.merge(options, {'binary': True, 'needs_content': False}))

    # Unzip in memory and process the records.
    zfile = zipfile.ZipFile(utils.cache_dir() + "/" + cache_zip_path)
    csvreader = csv.DictReader(zfile.open("bills80-92.txt"), delimiter="\t")
    for record in csvreader:
        rec = process_bill(record)

        import pprint
        pprint.pprint(rec)

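# The DictReader above is handed ZipFile.open()'s byte stream directly, which works on
# Python 2 but not Python 3 (csv needs text there). A hedged sketch of the same idea using
# a tiny in-memory zip with made-up column names rather than the real Adler-Wilkerson file:
import csv
import io
import zipfile

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("bills80-92.txt", "BillNum\tTitle\n1\tAn example bill\n")

with zipfile.ZipFile(buf) as zf, zf.open("bills80-92.txt") as raw:
    reader = csv.DictReader(io.TextIOWrapper(raw, encoding="utf-8"), delimiter="\t")
    for record in reader:
        print(record["BillNum"], record["Title"])
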
def get_sitemap(year, collection, lastmod, options):
    # Construct the URL and the path to where to cache the file on disk.
    if year is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
        path = "fdsys/sitemap/sitemap.xml"
    elif collection is None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Should we re-download the file?
    lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    if options.get("cached", False):
        # If --cached is used, don't hit the network.
        force = False
    elif not lastmod:
        # No *current* lastmod date is known for this file (because it is the master
        # sitemap file, probably), so always download.
        force = True
    else:
        # If the file is out of date or --force is used, download the file.
        cache_lastmod = utils.read(lastmod_cache_file)
        force = (lastmod != cache_lastmod) or options.get("force", False)

    if force:
        logging.warn("Downloading: %s" % url)

    body = utils.download(url, path, utils.merge(options, {"force": force, "xml": True}))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)

    return etree.fromstring(body)

def run():
    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec",  # handled specially...
        "govtrack": "govtrack",  # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = {}
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.
        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue:
                    url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" % (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(url, None, True))  # can't cache eicontinue probably

                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue:
                    break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        if " campaign" in p:
            continue
        if " (surname)" in p:
            continue
        if "career of " in p:
            continue
        if "for Congress" in p:
            continue
        if p.startswith("List of "):
            continue
        if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"):
            continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath("string(mw:page/mw:revision/mw:text)",
                                 namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"})

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia": p  # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/val pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m:
                continue  # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg:
                    continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields:
                            val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets":
                        val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids:
                continue
            new_ids["bioguide"] = new_ids["bioguide"].upper()  # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m:
                continue  # no template?
            bioguide = m.group(1).upper()

        if bioguide not in bioguides:
            print("Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)")
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id:
            del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")

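# A self-contained sketch of the CongLinks parsing above, run on a made-up template string.
# fieldmap/int_fields mirror the mapping defined at the top of run(); the ID values are fake.
import re

page_content = "{{CongLinks | congbio = x000000 | govtrack = 400000 | cspan = 12345 }}"
fieldmap = {"congbio": "bioguide", "govtrack": "govtrack", "cspan": "cspan"}
int_fields = ("govtrack", "cspan")

new_ids = {}
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
if m:
    for arg in m.group(1).split("|"):
        if "=" not in arg:
            continue
        key, val = (part.strip() for part in arg.split("=", 1))
        if val and key in fieldmap:
            new_ids[fieldmap[key]] = int(val) if fieldmap[key] in int_fields else val

print(new_ids)  # {'bioguide': 'x000000', 'govtrack': 400000, 'cspan': 12345}
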
def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
    """Updates the local cache of a sitemap file."""

    # What is this sitemap for?
    subject = extract_sitemap_subject_from_url(url, how_we_got_here)

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Does the user want to process this sitemap?
    if skip_sitemap(subject, options):
        return

    # Where to cache the sitemap and a file where we store its current <lastmod> date
    # (which comes from a parent sitemap)?
    (cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache_file, current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(url, cache_file, utils.merge(options, {
        'force': download,
        'binary': True,
    }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file --- if we just downloaded it.
    if download and current_lastmod:
        utils.write(current_lastmod, lastmod_cache_file)

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            update_sitemap(url, lastmod, how_we_got_here, options, listing)

    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
        # This is a regular sitemap with content items listed.

        # For the --list command, remember that this sitemap had some data.
        # And then return --- don't download any package files.
        if options.get("list"):
            listing.append(subject)
            return

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            if not subject.get("bulkdata"):
                # This is a regular collection item.
                #
                # Get the "package" name, i.e. a particular document (which has
                # one or more file formats within it).
                m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                package_name = m.group(1)

                if options.get("filter") and not re.search(options["filter"], package_name):
                    continue

                mirror_package(subject, package_name, lastmod, url, options)

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                item_path = m.group(1)

                if options.get("filter") and not re.search(options["filter"], item_path):
                    continue

                mirror_bulkdata_file(subject, url, item_path, lastmod, options)

    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))

def bill_ids_for(congress, options, bill_states={}):
    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get('amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:
        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = r"^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(
                page_for(congress, bill_type, offset),
                page_cache_for(congress, bill_type, offset),
                options)
            if not page:
                logging.error("Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent()  # the <p> tag containing the whole search hit
                    parent_node.remove(parent_node.xpath("b")[0])  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" % bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)

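# A self-contained sketch of the EXSLT-regex XPath used above to pick out bill links.
# The HTML fragment is made up, and "H.R." stands in for whatever display prefix
# utils.thomas_types[bill_type][1] holds for the bill type.
from lxml import html

link_pattern = r"^\s*%s\d+\s*$" % "H.R."
doc = html.document_fromstring(
    "<p><a href='#'>H.R.1234</a> <a href='#'>About this page</a> <a href='#'>S.56</a></p>")
links = doc.xpath("//a[re:match(text(), '%s')]" % link_pattern,
                  namespaces={"re": "http://exslt.org/regular-expressions"})
print([a.text for a in links])  # ['H.R.1234']
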
def mirror_packages(fetch_collections, options):
    """Create a local mirror of FDSys document files. Only downloads
    changed files, according to the sitemap. Run update_sitemap_cache first.

    Pass fetch_collections as None, or, to restrict the update to particular
    FDSys collections, a set of collection names.

    Set options["store"] to a comma-separated list of file types (pdf, mods,
    text, xml, zip).
    """

    # For determining whether we need to process a sitemap file again on a later
    # run, we need to make a key out of the command line arguments that affect
    # which files we are downloading.
    cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))

    file_types = options["store"].split(",")

    # Process each FDSys sitemap...
    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):

        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        # Has this sitemap changed since the last successful mirror?
        #
        # The sitemap's last modification time is stored in ...-lastmod.txt,
        # which comes from the sitemap's parent sitemap's lastmod listing for
        # the file.
        #
        # Compare that to the lastmod value of when we last did a successful mirror.
        # This function can be run to fetch different sets of files, so get the
        # lastmod value corresponding to the current run arguments.
        sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
        sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
        if os.path.exists(sitemap_store_state_file):
            sitemap_store_state = json.load(open(sitemap_store_state_file))
            if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
                # sitemap hasn't changed since the last time
                continue

        logging.info("scanning " + sitemap + "...")

        # Load the sitemap for this year & collection, and loop through each document.
        for package_name, lastmod in get_sitemap_entries(sitemap):

            # Add this package to the download list.
            file_list = []

            if not options.get("granules", False):
                # Doing top-level package files (granule==None).
                file_list.append(None)

            else:
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # In the STATUTE collection, the MODS information in granules is redundant with
                # information in the top-level package MODS file. But the only way to get granule-
                # level PDFs is to go through the granules.
                content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
                content_index = utils.download(
                    content_detail_url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(options, {
                        'binary': True,
                    }))
                if not content_index:
                    raise Exception("Failed to download %s" % content_detail_url)
                for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append(granule_name)

            # Download the files of the desired types.
            for granule_name in file_list:
                mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options)

        # If we got this far, we successfully downloaded all of the files in this year/collection.
        # To speed up future updates, save the lastmod time of this sitemap in a file indicating
        # what we downloaded. The store-state file contains a JSON mapping of command line options
        # to the most recent lastmod value for this sitemap.
        sitemap_store_state = {}
        if os.path.exists(sitemap_store_state_file):
            sitemap_store_state = json.load(open(sitemap_store_state_file))
        sitemap_store_state[cache_options_key] = sitemap_last_mod
        json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))

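# A small sketch of the "store-state" bookkeeping above: the key is simply the repr() of the
# download-relevant options, and the value is the sitemap's lastmod string. All values here
# are made up.
import json

options = {"store": "mods,pdf", "year": "2013", "granules": False}
cache_options_key = repr(tuple(sorted(
    kv for kv in options.items()
    if kv[0] in ("store", "year", "congress", "granules", "cached"))))

sitemap_store_state = {cache_options_key: "2013-12-31T12:00:00Z"}
print(json.dumps(sitemap_store_state, indent=2))
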
def cache(inspector, path):
    return os.path.join(utils.cache_dir(), inspector, path)

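# Hypothetical usage of the helper above (the inspector name and relative path are made up):
#
#   cache("some-inspector", "reports/2014/report-1.json")
#   # -> os.path.join(utils.cache_dir(), "some-inspector", "reports/2014/report-1.json")
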
def update_sitemap(url, current_lastmod, how_we_got_here, options, listing):
    """Updates the local cache of a sitemap file."""

    # Return a list of files we downloaded.
    results = []

    # What is this sitemap for?
    subject = extract_sitemap_subject_from_url(url, how_we_got_here)

    # For debugging, remember what URLs we are stepping through.
    how_we_got_here = how_we_got_here + [url]

    # Does the user want to process this sitemap?
    if skip_sitemap(subject, options):
        return

    # Where to cache the sitemap and a file where we store its current <lastmod> date
    # (which comes from a parent sitemap)?
    (cache_file, lastmod_cache_file) = get_sitemap_cache_files(subject)
    lastmod_cache_file = os.path.join(utils.cache_dir(), lastmod_cache_file)

    # Download anew if the current_lastmod doesn't match the stored lastmod
    # in our cache, and if --cache is not specified. Or if --force is given.
    # If we're not downloading it, load it from disk because we still have
    # to process each sitemap to ensure we've downloaded all of the package
    # files the user wants.
    download = should_download_sitemap(lastmod_cache_file, current_lastmod, options)

    # Download, or just retrieve from cache.
    if download:
        logging.warn("Downloading: %s" % url)
    body = utils.download(url, cache_file, utils.merge(options, {
        'force': download,
        'binary': True,
    }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file --- if we just downloaded it.
    if download and current_lastmod:
        utils.write(current_lastmod, lastmod_cache_file)

    # Load the XML.
    try:
        sitemap = etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))

    # Process the entries.
    if sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
        # This is a sitemap index. Process the sitemaps listed in this
        # sitemapindex recursively.
        for node in sitemap.xpath("x:sitemap", namespaces=ns):
            # Get URL and lastmod date of the sitemap.
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
            sitemap_results = update_sitemap(url, lastmod, how_we_got_here, options, listing)
            if sitemap_results is not None:
                results = results + sitemap_results

    elif sitemap.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
        # This is a regular sitemap with content items listed.

        # For the --list command, remember that this sitemap had some data.
        # And then return --- don't download any package files.
        if options.get("list"):
            listing.append(subject)
            return

        # Process the items.
        for node in sitemap.xpath("x:url", namespaces=ns):
            url = str(node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))

            if not subject.get("bulkdata"):
                # This is a regular collection item.
                #
                # Get the "package" name, i.e. a particular document (which has
                # one or more file formats within it).
                m = re.match("https://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched package URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                package_name = m.group(1)

                if options.get("filter") and not re.search(options["filter"], package_name):
                    continue

                mirror_results = mirror_package(subject, package_name, lastmod, url, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

            else:
                # This is a bulk data item. Extract components of the URL.
                m = re.match(re.escape(BULKDATA_BASE_URL) + re.escape(subject["collection"]) + "/(.+)", url)
                if not m:
                    raise Exception("Unmatched bulk data file URL (%s) at %s." % (url, "->".join(how_we_got_here)))
                item_path = m.group(1)

                if options.get("filter") and not re.search(options["filter"], item_path):
                    continue

                mirror_results = mirror_bulkdata_file(subject, url, item_path, lastmod, options)
                if mirror_results is not None and len(mirror_results) > 0:
                    results = results + mirror_results

    else:
        raise Exception("Unknown sitemap type (%s) at the root sitemap of %s." % (sitemap.tag, url))

    return results

def mirror_files(fetch_collections, options):
    # Locally mirror certain file types for the specified collections.

    file_types = options["store"].split(",")

    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        logging.warn(sitemap + "...")

        # Load the sitemap for this year & collection.
        dom = etree.parse(sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each document in the collection in this year...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # Get URL and last modified timestamp.
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
            if not url.endswith("/content-detail.html"):
                raise Exception("Unrecognized file pattern.")

            # Get the package name.
            m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched document URL")
            package_name = m.group(1)

            # Where to store the document files?
            # The path will depend a bit on the collection.
            if collection == "BILLS":
                # Store with the other bill data.
                m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched bill document URL: " + url)
                congress, bill_type, bill_number, version_code = m.groups()
                congress = int(congress)
                if "congress" in options and congress != int(options["congress"]):
                    continue
                path = output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
            else:
                # Store in fdsys/COLLECTION/YEAR/PKGNAME.
                path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)

            # Do we need to update this record?
            lastmod_cache_file = path + "/lastmod.txt"
            cache_lastmod = utils.read(lastmod_cache_file)
            force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

            # Add this package to the download list.
            file_list = []
            file_list.append((None, path))

            if options.get("granules", False):
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # Josh originally thought the STATUTE granule files (individual statutes) were
                # useful, but then it turned out the information is redundant with information
                # in the top-level package MODS file.
                content_index = utils.download(
                    url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(options, {
                        "xml": True,  # it's not XML, but this avoids unescaping HTML, which fails if there are unicode characters
                        "force": force,
                    }))
                if not content_index:
                    raise Exception("Failed to download %s" % url)
                for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append((granule_name, path + "/" + granule_name))

            # Download the files of the desired types.
            for granule_name, path in file_list:
                targets = get_package_files(package_name, granule_name, path)
                for file_type in file_types:
                    if file_type not in targets:
                        raise Exception("Invalid file type: %s" % file_type)
                    f_url, f_path = targets[file_type]

                    if force:
                        logging.warn(f_path)
                    data = utils.download(f_url, f_path, utils.merge(options, {
                        "xml": True,
                        "force": force,
                        "to_cache": False,
                    }))
                    if not data:
                        raise Exception("Failed to download %s" % url)

            # Write the current last modified date to disk so we know the next time whether
            # we need to fetch the file.
            if lastmod and not options.get("cached", False):
                utils.write(lastmod, lastmod_cache_file)
