def get_output_path(collection, package_name, options):
    """Decide where on disk a govinfo package's document files belong.

    The layout varies by collection:
      * BILLS -> stored with the other bill data
        ([congress]/bills/[billtype]/[billtype][billnumber]/text-versions/[version])
      * CRPT  -> [congress]/crpt/[reporttype]/[reporttype][reportnumber]
      * anything else -> govinfo/COLLECTION/PKGNAME

    Returns the path string, or None when the package's congress number does
    not match the "congress" option. Raises ValueError for a CRPT package
    name that cannot be parsed.
    """
    if collection == "BILLS":
        # Bill text is filed with the rest of that bill's data.
        parsed = get_bill_id_for_package(
            package_name,
            with_version=False,
            restrict_to_congress=options.get("congress"))
        if not parsed:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = parsed
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    if collection == "CRPT":
        # Committee reports: parse e.g. "116hrpt123" into congress/type/number.
        match = re.match(r"(\d+)([hse]rpt)(\d+)$", package_name)
        if match is None:
            raise ValueError(package_name)
        congress, report_type, report_number = match.groups()
        wanted_congress = options.get("congress")
        if wanted_congress and congress != wanted_congress:
            return None  # congress number does not match options["congress"]
        return "%s/%s/%s/%s/%s" % (utils.data_dir(), congress, collection.lower(),
                                   report_type, report_type + report_number)

    # Default layout for every other collection.
    return "%s/govinfo/%s/%s" % (utils.data_dir(), collection, package_name)
def get_output_path(collection, package_name, options):
    """Return the on-disk directory for a govinfo package's document files.

    NOTE(review): an identical get_output_path is defined earlier in this
    file; at import time this later definition is the one that takes effect.
    Consider removing one of the two copies.

    Layout by collection:
      * BILLS -> with the other bill data
        ([congress]/bills/[billtype]/[billtype][billnumber]/text-versions/[version])
      * CRPT  -> [congress]/crpt/[reporttype]/[reporttype][reportnumber]
      * otherwise -> govinfo/COLLECTION/PKGNAME

    Returns None when the package's congress does not match the "congress"
    option; raises ValueError for an unparseable CRPT package name.
    """
    if collection not in ("BILLS", "CRPT"):
        # Generic layout for all other collections.
        return "%s/govinfo/%s/%s" % (utils.data_dir(), collection, package_name)

    if collection == "BILLS":
        info = get_bill_id_for_package(
            package_name,
            with_version=False,
            restrict_to_congress=options.get("congress"))
        if not info:
            # Congress number does not match options["congress"].
            return None
        from bills import output_for_bill
        bill_id, version_code = info
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    # collection == "CRPT": split e.g. "116hrpt123" into its components.
    parsed = re.match(r"(\d+)([hse]rpt)(\d+)$", package_name)
    if not parsed:
        raise ValueError(package_name)
    congress, report_type, report_number = parsed.groups()
    if options.get("congress") and congress != options.get("congress"):
        # Congress number does not match options["congress"].
        return None
    return "%s/%s/%s/%s/%s" % (utils.data_dir(), congress, collection.lower(),
                               report_type, report_type + report_number)
def get_output_path(sitemap, package_name, granule_name, options):
    """Decide where to store the document files for an FDSys package.

    BILLS packages go with the rest of that bill's data; everything else
    lands in fdsys/COLLECTION/YEAR/PKGNAME[/GRANULE_NAME]. Returns None
    when the bill's congress does not match the "congress" option.
    """
    collection = sitemap["collection"]

    if collection == "BILLS":
        # Store with the other bill data.
        parsed = get_bill_id_for_package(
            package_name,
            with_version=False,
            restrict_to_congress=options.get("congress"))
        if not parsed:
            return None  # congress number does not match options["congress"]
        from bills import output_for_bill
        bill_id, version_code = parsed
        return output_for_bill(bill_id, "text-versions/" + version_code, is_data_dot=False)

    # Default: fdsys/COLLECTION/YEAR/PKGNAME, with the granule name
    # appended as a subdirectory when one is given.
    path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection,
                                  sitemap["year"], package_name)
    if granule_name:
        path += "/" + granule_name
    return path
def mirror_bulkdata_file(sitemap, url, item_path, lastmod, options):
    """Mirror one FDSys bulk-data file to disk.

    Returns a list containing the path of the file we downloaded, or None
    when the file was skipped (already up to date, or --cached and present)
    or the download failed.
    """
    collection = sitemap["collection"]

    # Default destination: fdsys/COLLECTION/ITEM_PATH.
    destination = "%s/fdsys/%s/%s" % (utils.data_dir(), collection, item_path)

    if collection == "BILLSTATUS":
        # Bill status files live alongside the rest of that bill's data.
        from bills import output_for_bill
        package = os.path.splitext(os.path.basename(item_path))[0]
        bill_id, version_code = get_bill_id_for_package(package, with_version=False)
        destination = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # The sitemap's lastmod for this file is cached next to it so later
    # runs can tell whether the file has changed.
    lastmod_cache_file = os.path.splitext(destination)[0] + "-lastmod.txt"

    # Skip when the file is already up to date (unless --force)...
    if not options.get("force", False) and os.path.exists(lastmod_cache_file):
        if utils.read(lastmod_cache_file) == lastmod:
            return

    # ...or when --cached was given and any copy already exists on disk.
    if options.get("cached", False) and os.path.exists(destination):
        return

    logging.warn("Downloading: " + destination)
    download_options = utils.merge(options, {
        'binary': True,
        'force': True,  # decision to cache was made above
        'to_cache': False,
    })
    data = utils.download(url, destination, download_options)
    if not data:
        return  # something failed

    # Record the lastmod we just mirrored so the next run can skip this file.
    utils.write(lastmod, lastmod_cache_file)
    return [destination]
def mirror_bulkdata_file(collection, url, item_path, lastmod, options):
    """Mirror one govinfo bulk-data file to disk.

    Returns a list containing the downloaded file's path, or None if the
    file was already up to date, skipped because of --cached, or failed
    to download.
    """
    # Default location: govinfo/COLLECTION/ITEM_PATH.
    path = "%s/govinfo/%s/%s" % (utils.data_dir(), collection, item_path)

    if collection == "BILLSTATUS":
        # BILLSTATUS files are stored with the rest of the bill's data;
        # derive the bill id from the file name, dropping the
        # "BILLSTATUS-" prefix first.
        from bills import output_for_bill
        stem = os.path.splitext(os.path.basename(item_path.replace("BILLSTATUS-", "")))[0]
        bill_id, version_code = get_bill_id_for_package(stem, with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # Cache of the sitemap's lastmod, stored next to the file, so we can
    # tell later whether the file has changed.
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        if utils.read(lastmod_cache_file) == lastmod:
            return  # already up to date

    if options.get("cached", False) and os.path.exists(path):
        return  # --cached: skip anything already downloaded

    logging.warn("Downloading: " + path)
    data = utils.download(url, path, utils.merge(options, {
        'binary': True,
        'force': True,  # decision to cache was made above
        'to_cache': False,
    }))
    if not data:
        return  # something failed

    # Remember the lastmod we just mirrored so the next run can tell
    # whether it needs to fetch the file again.
    utils.write(lastmod, lastmod_cache_file)
    return [path]
def fetch_floor_week(for_the_week, options):
    """Fetch and parse the House floor schedule for one week.

    for_the_week: a YYYYMMDD string identifying the week's schedule file.
    options: run options. "download" additionally mirrors each item's
             attached files to disk; "force" is turned on by default for
             the schedule itself (but not for attachments).

    Returns a dict with 'congress', 'week_of' (YYYY-MM-DD), and 'upcoming'
    (a list of floor item dicts), or None if the week's schedule does not
    exist or could not be downloaded. Side effects: writes each bill's
    floor item JSON into that bill's text-versions directory.

    Fixes vs. the previous revision:
      * the "Skipping item, not a bill" warning used `description` before
        it was assigned (NameError on the first non-bill item, a stale
        value afterwards) — the description is now read before the check;
      * a failed schedule download (falsy body) no longer raises a
        TypeError from the "was not found" substring test.
    """
    base_url = 'https://docs.house.gov/floor/Download.aspx?file=/billsthisweek/'
    week_url = base_url + '%s/%s.xml' % (for_the_week, for_the_week)

    # Turn on 'force' to re-download the schedules, by default, since the
    # content changes frequently and we're scanning weeks that might have
    # 404'd previously when we looked ahead. We leave 'force' off for
    # downloading the file attachments.
    options2 = dict(options)
    if "force" not in options2:
        options2["force"] = True
    body = utils.download(week_url, 'upcoming_house_floor/%s.xml' % for_the_week, options2)

    # Treat a failed download (falsy body) the same as a missing week.
    if not body or "was not found" in body:
        return None

    dom = lxml.etree.fromstring(body)

    # Can download the actual attached files to disk, if asked.
    download = options.get("download", False)

    # Always present at the feed level.
    congress = int(dom.xpath('//floorschedule')[0].get('congress-num'))

    # Week of this day, e.g. '2013-01-21'.
    legislative_day = for_the_week[0:4] + '-' + for_the_week[4:6] + '-' + for_the_week[6:]

    upcoming = []

    for node in dom.xpath('//floorschedule/category/floor-items/floor-item'):
        bill_number = node.xpath('legis-num//text()')[0]

        # Read the description before the skip check below so the warning
        # can reference it. Not every item carries floor text, so fall
        # back to an empty string rather than raising IndexError.
        floor_text = node.xpath('floor-text//text()')
        description = floor_text[0] if floor_text else ""

        # TODO: fetch non-bills too
        if not bill_number:
            logging.warn("Skipping item, not a bill: %s" % description)
            continue

        # How is this bill being considered?
        category = next(node.iterancestors("category")).get('type')
        if "suspension" in category:
            consideration = "suspension"
        elif "pursuant" in category:
            consideration = "rule"
        else:
            consideration = "unknown"

        logging.warn("[%s]" % bill_number)

        # TODO: establish most recent date from a combo of added, published, updates
        date = date_for(node.get('publish-date'))

        # All items will have these fields.
        bill = {
            'description': description,
            'floor_item_id': node.get('id'),
            'consideration': consideration,
            'published_at': date_for(node.get('publish-date')),
            'added_at': date_for(node.get('add-date')),
        }

        # Treat drafts and numbered bills a bit differently. Draft bills
        # (an underscore in the legis-num) get a synthesized draft id.
        if "_" in bill_number:
            draft_bill_id = draft_bill_id_for(bill_number, date, congress)
            bill['item_type'] = 'draft_bill'
            bill['draft_bill_id'] = draft_bill_id
        else:
            # Amendments-between-the-chambers show up as prose around the
            # underlying bill number; classify and strip that prefix.
            m = re.match(
                "(Concur(ring)? in )?(?P<type>((the )?(Senate|House) Amendments? (with an amendment )?to )+)(?P<bill>.*)",
                bill_number, re.I)
            if m:
                amendment_type = m.group("type").split("to")[0]
                if "Senate" in amendment_type and "House" not in amendment_type:
                    bill['item_type'] = 'senate_amendment'
                elif "House" in amendment_type and "Senate" not in amendment_type:
                    bill['item_type'] = 'house_amendment'
                else:
                    raise ValueError(bill_number)
                bill_number = m.group("bill")
            elif re.match("Conference report to accompany ", bill_number, re.I):
                bill['item_type'] = 'conference_report'
                bill_number = bill_number.lower().replace(
                    "conference report to accompany ", '')
            else:
                bill['item_type'] = 'bill'

            # In one case we got "H. Res. 497 (H. Rept. 116-125)".
            # Stop at parens.
            bill_number = re.sub(r"\(.*", "", bill_number)

            try:
                bill['bill_id'] = bill_id_for(bill_number.strip(), congress)
            except ValueError:
                logging.error("Could not parse bill from: %s" % bill_number)
                continue

        bill['files'] = []
        for file in node.xpath('files/file'):
            file_url = file.get('doc-url')
            filename = file_url.split('/')[-1]
            file_format = file.get('doc-type').lower()

            logging.warn("\t%s file for %s: %s" % (file_format.upper(), bill_number, filename))

            file_field = {
                'url': file_url,
                'format': file_format,
                'added_at': date_for(file.get('add-date')),
                'published_at': date_for(file.get('publish-date'))
            }
            bill['files'].append(file_field)

            # Now try downloading the file to disk and linking it to the data.
            if not download:
                continue

            try:
                file_path = 'upcoming_house_floor/%s/%s' % (for_the_week, filename)
                try:
                    os.makedirs(os.path.join(utils.data_dir(), os.path.dirname(file_path)))
                except OSError:
                    pass  # directory exists
                options3 = dict(options)
                options3["to_cache"] = False  # put in the actual specified directory
                options3["binary"] = True  # force binary mode, no file escaping
                utils.download(file_url, os.path.join(utils.data_dir(), file_path), options3)
                file_field['path'] = file_path
            except IOError:
                logging.error("Omitting 'path', couldn't download file %s from House floor for the week of %s" % (file_field['url'], for_the_week))
                continue

            # If it's a PDF, convert to text and extract any embedded XML.
            if file_format == "pdf" and file_path.endswith(".pdf"):
                # Extract text.
                text_path = file_path.replace(".pdf", ".txt")
                if subprocess.call(["pdftotext", "-layout",
                                    os.path.join(utils.data_dir(), file_path),
                                    os.path.join(utils.data_dir(), text_path)],
                                   universal_newlines=True) != 0:
                    raise Exception("pdftotext failed on %s" % file_path)
                file_field['text_path'] = text_path

                # Extract embedded XML attachments.
                for line in subprocess.check_output(
                        ["pdfdetach", "-list", os.path.join(utils.data_dir(), file_path)],
                        universal_newlines=True).split("\n"):
                    m = re.match(r"(\d+):\s*(.*)", line)
                    if m:
                        attachment_n, attachment_fn = m.groups()
                        if attachment_fn.endswith(".xml"):
                            text_path = file_path.replace(".pdf", ".xml")
                            subprocess.check_call(
                                ["pdfdetach", os.path.join(utils.data_dir(), file_path),
                                 "-save", attachment_n,
                                 "-o", os.path.join(utils.data_dir(), text_path)],
                                universal_newlines=True)
                            file_field['xml_path'] = text_path

        upcoming.append(bill)

        if "bill_id" in bill:
            # Save this bill data to the bill's bill text directory.
            text_data_path = output_for_bill(
                bill['bill_id'],
                os.path.join("text-versions", "dhg-" + bill["floor_item_id"] + ".json"),
                is_data_dot=False)
            try:
                os.makedirs(os.path.join(utils.data_dir(), os.path.dirname(text_data_path)))
            except OSError:
                pass  # directory exists
            utils.write(
                json.dumps(bill, sort_keys=True, indent=2, default=utils.format_datetime),
                text_data_path)

    # Create and return the house floor file data.
    return {
        'congress': congress,
        'week_of': legislative_day,
        'upcoming': upcoming
    }