Exemple #1
0
def update_bill_version_list(only_congress):
  """Scan the cached FDSys BILLS sitemaps and write a text-versions.json
  file for every bill found, mapping each version code to its URL and
  last-modified timestamp."""
  bill_versions = { }

  # Decide which year-by-year sitemap files to scan.
  if only_congress:
    # --congress=X: restrict to the sitemap years covering that congress,
    # skipping any year whose sitemap has not been downloaded.
    candidates = [
      utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
      for year in utils.get_congress_years(only_congress)
    ]
    sitemap_files = [path for path in candidates if os.path.exists(path)]
  else:
    sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")

  # Walk every <url> entry in each year's sitemap.
  for sitemap_path in sitemap_files:
    root = etree.parse(sitemap_path).getroot()
    if root.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
      raise Exception("Mismatched sitemap type.")

    for entry in root.xpath("x:url", namespaces=ns):
      # Pull out the document URL and its last-modified timestamp.
      url = str(entry.xpath("string(x:loc)", namespaces=ns))
      lastmod = str(entry.xpath("string(x:lastmod)", namespaces=ns))

      # Parse congress, bill type, bill number, and version code out of the URL.
      m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
      if not m:
        raise Exception("Unmatched bill document URL: " + url)
      congress, bill_type, bill_number, version_code = m.groups()
      congress = int(congress)
      if bill_type not in utils.thomas_types:
        raise Exception("Invalid bill type: " + url)

      # With --congress=X, ignore bills from any other congress.
      if only_congress and congress != only_congress:
        continue

      # Record this version under congress -> bill type -> bill number.
      by_congress = bill_versions.setdefault(congress, { })
      by_type = by_congress.setdefault(bill_type, { })
      by_number = by_type.setdefault(bill_number, { })
      by_number[version_code] = {
        "url": url,
        "lastmod": lastmod,
      }

  # Write out the collected version info. This must wait until every sitemap
  # has been scanned: a single bill's versions can be spread across multiple
  # year files, and each bill's JSON must contain its complete version list.
  for congress in bill_versions:
    for bill_type in bill_versions[congress]:
      for bill_number in bill_versions[congress][bill_type]:
        utils.write(
          json.dumps(bill_versions[congress][bill_type][bill_number],
            sort_keys=True, indent=2, default=utils.format_datetime),
          output_for_bill(congress, bill_type, bill_number, "text-versions.json")
        )
Exemple #2
0
def get_output_path(year, collection, package_name, granule_name, options):
  """Return the directory path where this package's (or granule's) files
  should be stored, or None when the package should be skipped because its
  congress does not match options["congress"]."""
  # Where to store the document files?
  # The path will depend a bit on the collection.
  if collection == "BILLS":
    # Store with the other bill data. get_bill_id_for_package returns a
    # (bill_id, version_code) pair, or a falsy value when the package's
    # congress does not match the requested one.
    bill_and_ver = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
    if not bill_and_ver: return None # congress number does not match options["congress"]
    # BUG FIX: unpack only after the falsy check. The original unpacked the
    # return value directly, so a None return raised TypeError ("cannot
    # unpack non-sequence") instead of signalling "skip this package".
    bill_id, version_code = bill_and_ver
    return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)
  else:
    # Store in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
    path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
    if granule_name: path += "/" + granule_name
    return path
Exemple #3
0
def get_output_path(year, collection, package_name, granule_name, options):
  """Return the on-disk directory for a downloaded FDSys package/granule,
  or None when a BILLS package belongs to a congress other than the one
  requested in options."""
  if collection != "BILLS":
    # Generic collections are stored under fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME].
    base = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)
    return base + "/" + granule_name if granule_name else base

  # Bill text is stored alongside the rest of the bill data. The helper
  # returns a falsy value when the package's congress doesn't match
  # options["congress"], in which case the package is skipped.
  result = get_bill_id_for_package(package_name, with_version=False, restrict_to_congress=options.get("congress"))
  if not result:
    return None # congress number does not match options["congress"]
  bill_id, version_code = result
  return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)
Exemple #4
0
def run(options):
    """Fetch amendment data: a single amendment (amendment_id), all
    amendments of one bill (bill_id), or every amendment for a congress."""
    amendment_id = options.get('amendment_id', None)
    bill_id = options.get('bill_id', None)

    # Tracks the last observed state of each amendment, saved at the end
    # for later fast-searching.
    search_state = {}

    if amendment_id:
        # A single amendment was requested explicitly.
        amendment_type, number, congress = utils.split_bill_id(amendment_id)
        to_fetch = [amendment_id]

    elif bill_id:
        # Crawl the bill first, then fetch every amendment it lists.
        bill_type, number, congress = utils.split_bill_id(bill_id)
        bill_status = fetch_bill(bill_id, options)
        if not bill_status['ok']:
            logging.error("Couldn't download information for that bill.")
            return None
        bill = json.loads(utils.read(output_for_bill(bill_id, "json")))
        to_fetch = [amdt["amendment_id"] for amdt in bill["amendments"]]

    else:
        # No specific target: enumerate every amendment for the congress.
        congress = options.get('congress', utils.current_congress())

        to_fetch = bill_ids_for(congress,
                                utils.merge(options, {'amendments': True}),
                                bill_states=search_state)
        if not to_fetch:
            # In --fast mode an empty list just means nothing changed;
            # otherwise it indicates a real failure.
            if options.get("fast", False):
                logging.warn("No amendments changed.")
            else:
                logging.error(
                    "Error figuring out which amendments to download, aborting."
                )
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    if options.get('pages_only', False):
        return None

    logging.warn("Going to fetch %i amendments from congress #%s" %
                 (len(to_fetch), congress))
    saved_amendments = utils.process_set(to_fetch, fetch_amendment, options)

    # keep record of the last state of all these amendments, for later fast-searching
    save_bill_search_state(saved_amendments, search_state)
Exemple #5
0
def run(options):
  """Fetch amendments: one amendment by id, all amendments attached to a
  given bill, or every amendment in a congress."""
  amendment_id = options.get('amendment_id', None)
  bill_id = options.get('bill_id', None)

  # Last-known state of each amendment; persisted at the end so later
  # runs can fast-search for changes.
  search_state = { }

  if amendment_id:
    # Explicit single-amendment request.
    amendment_type, number, congress = utils.split_bill_id(amendment_id)
    to_fetch = [amendment_id]

  elif bill_id:
    # Crawl the bill itself first, then queue its listed amendments.
    bill_type, number, congress = utils.split_bill_id(bill_id)
    bill_status = fetch_bill(bill_id, options)
    if not bill_status['ok']:
      logging.error("Couldn't download information for that bill.")
      return None
    bill = json.loads(utils.read(output_for_bill(bill_id, "json")))
    to_fetch = [amdt["amendment_id"] for amdt in bill["amendments"]]

  else:
    # Whole-congress crawl.
    congress = options.get('congress', utils.current_congress())
    to_fetch = bill_ids_for(congress, utils.merge(options, {'amendments': True}), bill_states=search_state)

    if not to_fetch:
      # An empty list is expected under --fast (nothing changed); any
      # other time it means the enumeration itself failed.
      if options.get("fast", False):
        logging.warn("No amendments changed.")
      else:
        logging.error("Error figuring out which amendments to download, aborting.")
      return None

    limit = options.get('limit', None)
    if limit:
      to_fetch = to_fetch[:int(limit)]

  if options.get('pages_only', False):
    return None

  logging.warn("Going to fetch %i amendments from congress #%s" % (len(to_fetch), congress))
  saved_amendments = utils.process_set(to_fetch, fetch_amendment, options)

  # keep record of the last state of all these amendments, for later fast-searching
  save_bill_search_state(saved_amendments, search_state)