def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
    """Download the requested file types for one sitemap item (package/granule)
    into its output directory, skipping files that are already current.

    Side effects: writes downloaded files under the package's output path,
    unwraps HTML "text" files into .txt siblings, may write bill text-version
    metadata for BILLS packages, and records `lastmod` in lastmod.txt so the
    next run can detect staleness.

    Parameters:
      year, collection, package_name, granule_name -- identify the item and
        determine the output path (via get_output_path).
      lastmod -- last-modified stamp from the sitemap; compared against the
        cached stamp to decide whether a re-download is needed.
      file_types -- iterable of type keys; each must exist in the targets map
        returned by get_package_files, else an Exception is raised.
      options -- dict; honors "force" (re-download everything) and "cached"
        (never hit the network for updates).

    NOTE(review): uses the Python 2 `unicode` builtin and str/bytes text
    handling -- this function is Python-2-only as written.
    """
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name, options)
    if not path:
        return  # should skip

    # Do we need to update this record?  Compare the sitemap's lastmod against
    # the stamp we saved after the previous successful pass.
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    updated_file_types = set()
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)
        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warn("Downloading: " + f_path)
        data = utils.download(f_url, f_path, utils.merge(options, {
            'binary': True,
            'force': force,
            'to_cache': False,
            # Only the HTML "text" container needs its content returned here,
            # so it can be unwrapped into a .txt file below.
            'needs_content': file_type == "text" and f_path.endswith(".html"),
        }))
        # Recorded even when the download returns nothing -- presumably so the
        # BILLS/mods post-step below still fires; TODO confirm that is intended.
        updated_file_types.add(file_type)

        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
            continue

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            # html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                # f_path ends in ".html"; slicing off the last 4 chars leaves
                # the trailing "." so appending "txt" yields "....txt".
                text_content = unicode(html.fromstring(data).text_content())
                f.write(text_content.encode("utf8"))

    if collection == "BILLS" and "mods" in updated_file_types:
        # When we download bill files, also create the text-versions/data.json file
        # which extracts commonly used components of the MODS XML.
        from bill_versions import write_bill_version_metadata
        write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)
def mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options):
    """Download the requested file types for one package (or granule), including
    optional whole-package ZIP handling, skipping files that are already current.

    Differences from mirror_file: skips XML for BILLS packages before the 108th
    Congress, treats both "pdf" and "zip" as must-exist downloads, uses
    unwrap_text_in_html for HTML "text" containers, and extracts a downloaded
    package ZIP into the standard per-package file layout (mods.xml,
    premis.xml, dip.xml, document.pdf, document.txt).

    Parameters:
      year, collection, package_name, granule_name -- identify the item and
        determine the output path (via get_output_path).
      lastmod -- last-modified stamp from the sitemap; compared against the
        cached stamp to decide whether a re-download is needed.
      file_types -- iterable of type keys; each must exist in the targets map
        returned by get_package_files, else an Exception is raised.
      options -- dict; honors "force" (re-download everything) and "cached"
        (never hit the network for updates).
    """
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name, options)
    if not path:
        return  # should skip

    # Do we need to update this record?  Compare the sitemap's lastmod against
    # the stamp we saved after the previous successful pass.
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    updated_file_types = set()
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)

        # For BILLS, XML was not available until the 108th Congress, though even after that
        # it was spotty until the 111th or so Congress.
        # (package_name[6:9] is presumably the three-digit Congress number in a
        # name like "BILLS-110hr31ih" -- TODO confirm the naming scheme.)
        if file_type == "xml" and collection == "BILLS" and int(package_name[6:9]) < 108:
            continue

        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warn("Downloading: " + f_path)
        data = utils.download(f_url, f_path, utils.merge(options, {
            'binary': True,
            'force': force,
            'to_cache': False,
            # Only the HTML "text" container needs its content returned here,
            # so it can be unwrapped into a .txt file below.
            'needs_content': file_type == "text" and f_path.endswith(".html"),
        }))
        # Recorded even when the download returns nothing -- presumably so the
        # BILLS/mods post-step below still fires; TODO confirm that is intended.
        updated_file_types.add(file_type)

        if not data:
            if file_type in ("pdf", "zip"):
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
            continue

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            # html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                f.write(unwrap_text_in_html(data))

        if file_type == "zip":
            # This is the entire package in a ZIP file. Extract the contents of
            # this file to the appropriate paths.
            with zipfile.ZipFile(f_path) as zf:
                for z2 in zf.namelist():
                    # Every entry must live under a "<package_name>/" prefix.
                    if not z2.startswith(package_name + "/"):
                        raise ValueError("Unmatched file name in package ZIP: " + z2)
                    z2 = z2[len(package_name)+1:]  # strip off leading package name

                    if z2 in ("mods.xml", "premis.xml", "dip.xml"):
                        # Extract this file to a file of the same name.
                        z3 = path + "/" + z2
                    elif z2 == "pdf/" + package_name + ".pdf":
                        # Extract this file to "document.pdf".
                        z3 = path + "/document.pdf"
                    elif z2 == "html/" + package_name + ".htm":
                        # Extract this file and unwrap text to "document.txt".
                        z3 = path + "/document.txt"
                    else:
                        # Unknown layout inside the package ZIP -- fail loudly
                        # rather than write files to unexpected locations.
                        raise ValueError("Unmatched file name in package ZIP: " + z2)

                    with zf.open(package_name + "/" + z2) as zff:
                        # NOTE(review): opened in text mode ("w") even for XML/PDF
                        # payloads -- fine on POSIX/Python 2, but verify if this
                        # ever runs on Windows or Python 3.
                        with open(z3, "w") as output_file:
                            data = zff.read()
                            if z3 == path + "/document.txt":
                                data = unwrap_text_in_html(data)
                            output_file.write(data)

    if collection == "BILLS" and "mods" in updated_file_types:
        # When we download bill files, also create the text-versions/data.json file
        # which extracts commonly used components of the MODS XML.
        from bill_versions import write_bill_version_metadata
        write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)
# NOTE(review): this is a second definition of mirror_file, token-identical to
# the earlier one (only line-wrapping differs -- looks like an auto-formatter
# pass was pasted in). At import time it silently shadows the first definition.
# One of the two copies should be deleted -- confirm which with the file owner.
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
    """Download the requested file types for one sitemap item (package/granule)
    into its output directory, skipping files that are already current.

    Side effects: writes downloaded files under the package's output path,
    unwraps HTML "text" files into .txt siblings, may write bill text-version
    metadata for BILLS packages, and records `lastmod` in lastmod.txt so the
    next run can detect staleness.

    Parameters:
      year, collection, package_name, granule_name -- identify the item and
        determine the output path (via get_output_path).
      lastmod -- last-modified stamp from the sitemap; compared against the
        cached stamp to decide whether a re-download is needed.
      file_types -- iterable of type keys; each must exist in the targets map
        returned by get_package_files, else an Exception is raised.
      options -- dict; honors "force" (re-download everything) and "cached"
        (never hit the network for updates).

    NOTE(review): uses the Python 2 `unicode` builtin and str/bytes text
    handling -- this function is Python-2-only as written.
    """
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name, options)
    if not path:
        return  # should skip

    # Do we need to update this record?  Compare the sitemap's lastmod against
    # the stamp we saved after the previous successful pass.
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get(
        "force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    updated_file_types = set()
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)
        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warn("Downloading: " + f_path)
        data = utils.download(
            f_url,
            f_path,
            utils.merge(
                options,
                {
                    'binary': True,
                    'force': force,
                    'to_cache': False,
                    # Only the HTML "text" container needs its content returned
                    # here, so it can be unwrapped into a .txt file below.
                    'needs_content': file_type == "text" and f_path.endswith(".html"),
                }))
        # Recorded even when the download returns nothing -- presumably so the
        # BILLS/mods post-step below still fires; TODO confirm that is intended.
        updated_file_types.add(file_type)

        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
            continue

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            # html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                # f_path ends in ".html"; slicing off the last 4 chars leaves
                # the trailing "." so appending "txt" yields "....txt".
                text_content = unicode(html.fromstring(data).text_content())
                f.write(text_content.encode("utf8"))

    if collection == "BILLS" and "mods" in updated_file_types:
        # When we download bill files, also create the text-versions/data.json file
        # which extracts commonly used components of the MODS XML.
        from bill_versions import write_bill_version_metadata
        write_bill_version_metadata(
            get_bill_id_for_package(package_name, with_version=True))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)