Example #1
import logging
import os

import utils           # project-local helper module (assumed)
from lxml import html  # lxml.html, used to unwrap the "text" format's HTML container

# NOTE: Python 2 code (`unicode`, byte-oriented writes). get_output_path,
# get_package_files, and get_bill_id_for_package are module-level helpers
# defined elsewhere in the project.
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
  # Where should we store the file?
  path = get_output_path(year, collection, package_name, granule_name, options)
  if not path: return # should skip
  
  # Do we need to update this record?
  lastmod_cache_file = path + "/lastmod.txt"
  cache_lastmod = utils.read(lastmod_cache_file)
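  # Re-fetch when the upstream lastmod differs from the cached value or "force"
  # is set, but never when "cached" requests offline operation.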
  force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
  
  # Try downloading files for each file type.
  targets = get_package_files(package_name, granule_name, path)
  updated_file_types = set()
  for file_type in file_types:
    if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
    f_url, f_path = targets[file_type]
    
    if (not force) and os.path.exists(f_path): continue # we already have the current file
  logging.warning("Downloading: " + f_path)
    data = utils.download(f_url, f_path, utils.merge(options, {
      'binary': True, 
      'force': force, 
      'to_cache': False,
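      # 'needs_content' (assumed) asks utils.download to return the body so the
      # HTML "text" container can be unwrapped from `data` below: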
      'needs_content': file_type == "text" and f_path.endswith(".html"),
    }))
    updated_file_types.add(file_type)
    
    if not data:
      if file_type == "pdf":
        # expected to be present for all packages
        raise Exception("Failed to download %s" % package_name)
      else:
        # not all packages have all file types, but assume this is OK
        logging.error("file not found: " + f_url)
        continue
    
    if file_type == "text" and f_path.endswith(".html"):
      # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
      # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
      #       html.fromstring does auto-detection.
      with open(f_path[0:-4] + "txt", "w") as f:
        text_content = unicode(html.fromstring(data).text_content())
        f.write(text_content.encode("utf8"))
        
  if collection == "BILLS" and "mods" in updated_file_types:
    # When we download bill files, also create the text-versions/data.json file
    # which extracts commonly used components of the MODS XML.
    from bill_versions import write_bill_version_metadata
    write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the files for this sitemap item.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file) 
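
The examples lean on a small `utils` module for I/O. A minimal sketch of the
read/write pair the lastmod cache relies on, assuming read() returns None when
the cache file is missing and write() creates parent directories; the project's
real utils module may well differ.

import errno
import os

def read(path):
  # Return the cached contents, or None if no cache file exists yet.
  try:
    with open(path) as f:
      return f.read()
  except IOError as e:
    if e.errno == errno.ENOENT:
      return None
    raise

def write(data, path):
  # Create the parent directory if needed, then store the string to disk.
  dirname = os.path.dirname(path)
  if dirname and not os.path.isdir(dirname):
    os.makedirs(dirname)
  with open(path, "w") as f:
    f.write(data)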
Example #2
import logging
import os
import zipfile

import utils  # project-local helper module (assumed)

# unwrap_text_in_html and the get_* helpers are defined elsewhere in the
# project; a sketch of unwrap_text_in_html follows this example.
def mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options):
  # Where should we store the file?
  path = get_output_path(year, collection, package_name, granule_name, options)
  if not path: return # should skip
  
  # Do we need to update this record?
  lastmod_cache_file = path + "/lastmod.txt"
  cache_lastmod = utils.read(lastmod_cache_file)
  force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
  
  # Try downloading files for each file type.
  targets = get_package_files(package_name, granule_name, path)
  updated_file_types = set()
  for file_type in file_types:
    if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
    
    # For BILLS, XML was not available until the 108th Congress, though even after that
    # it was spotty until the 111th or so Congress.
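    # e.g. the hypothetical package name "BILLS-113hr803ih"[6:9] == "113"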
    if file_type == "xml" and collection == "BILLS" and int(package_name[6:9]) < 108:
      continue
    
    f_url, f_path = targets[file_type]
    
    if (not force) and os.path.exists(f_path): continue # we already have the current file
    logging.warning("Downloading: " + f_path)
    data = utils.download(f_url, f_path, utils.merge(options, {
      'binary': True, 
      'force': force, 
      'to_cache': False,
      'needs_content': file_type == "text" and f_path.endswith(".html"),
    }))
    updated_file_types.add(file_type)
    
    if not data:
      if file_type in ("pdf", "zip"):
        # expected to be present for all packages
        raise Exception("Failed to download %s" % package_name)
      else:
        # not all packages have all file types, but assume this is OK
        logging.error("file not found: " + f_url)
        continue
    
    if file_type == "text" and f_path.endswith(".html"):
      # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
      # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
      #       html.fromstring does auto-detection.
      with open(f_path[0:-4] + "txt", "w") as f:
        f.write(unwrap_text_in_html(data))

    if file_type == "zip":
      # This is the entire package in a ZIP file. Extract the contents of this file
      # to the appropriate paths.
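      # Expected layout inside the ZIP (enforced by the checks below):
      #   <package_name>/{mods.xml, premis.xml, dip.xml}
      #   <package_name>/pdf/<package_name>.pdf
      #   <package_name>/html/<package_name>.htm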
      with zipfile.ZipFile(f_path) as zf:
        for z2 in zf.namelist():
          if not z2.startswith(package_name + "/"): raise ValueError("Unmatched file name in package ZIP: " + z2)
          z2 = z2[len(package_name)+1:] # strip off leading package name

          if z2 in ("mods.xml", "premis.xml", "dip.xml"):
            # Extract this file to a file of the same name.
            z3 = path + "/" + z2
          elif z2 == "pdf/" + package_name + ".pdf":
            # Extract this file to "document.pdf".
            z3 = path + "/document.pdf"
          elif z2 == "html/" + package_name + ".htm":
            # Extract this file and unwrap text to "document.txt".
            z3 = path + "/document.txt"
          else:
            raise ValueError("Unmatched file name in package ZIP: " + z2)

          with zf.open(package_name + "/" + z2) as zff:
            # Binary mode: the ZIP entries include PDFs.
            with open(z3, "wb") as output_file:
              data = zff.read()
              if z3 == path + "/document.txt": data = unwrap_text_in_html(data)
              output_file.write(data)
        
  if collection == "BILLS" and "mods" in updated_file_types:
    # When we download bill files, also create the text-versions/data.json file
    # which extracts commonly used components of the MODS XML.
    from bill_versions import write_bill_version_metadata
    write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the files for this sitemap item.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file) 
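
Example #2 calls unwrap_text_in_html() without defining it. A plausible
definition, reconstructed from the inline unwrapping logic in Example #1
(Python 2, lxml assumed); the project's actual helper may differ.

from lxml import html

def unwrap_text_in_html(data):
  # Parse the HTML container and return only its text content, UTF-8 encoded.
  return unicode(html.fromstring(data).text_content()).encode("utf8")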
Example #3
import logging
import os

import utils           # project-local helper module (assumed)
from lxml import html  # lxml.html, as in Example #1

def mirror_file(year, collection, package_name, lastmod, granule_name,
                file_types, options):
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name,
                           options)
    if not path: return  # should skip

    # Do we need to update this record?
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get(
        "force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    updated_file_types = set()
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)
        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warning("Downloading: " + f_path)
        data = utils.download(f_url, f_path, utils.merge(options, {
            'binary': True,
            'force': force,
            'to_cache': False,
            'needs_content': file_type == "text" and f_path.endswith(".html"),
        }))
        updated_file_types.add(file_type)

        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
                continue

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            #       html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                text_content = unicode(html.fromstring(data).text_content())
                f.write(text_content.encode("utf8"))

    if collection == "BILLS" and "mods" in updated_file_types:
        # When we download bill files, also create the text-versions/data.json file
        # which extracts commonly used components of the MODS XML.
        from bill_versions import write_bill_version_metadata
        write_bill_version_metadata(
            get_bill_id_for_package(package_name, with_version=True))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)
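
For reference, a hypothetical invocation of mirror_file(); the package name,
lastmod timestamp, and option values are illustrative, not taken from a real
sitemap.

mirror_file(2013, "BILLS", "BILLS-113hr803ih", "2013-05-01T10:02:27Z",
            None, ["pdf", "mods", "text"], {"force": False, "cached": False})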