def proc_statute_volume(path, options):
    """Turn one Statutes at Large volume's MODS metadata into bill records.

    Reads ``<path>/mods.xml`` and visits every ``relatedItem`` whose granule
    class is PUBLICLAW, PRIVATELAW, HCONRES or SCONRES.  For each one it
    writes a bill record (unless the ``textversions`` option is set) and an
    "enr" bill-version record.  If ``<path>/<access_id>/document.pdf``
    exists it is optionally hard-linked next to the bill text (``linkpdf``)
    and converted to plain text with ``pdftotext`` (``extracttext``).

    Items with no primary bill number, with several primary bill numbers, or
    whose law and bill disagree about the Congress are logged and skipped.

    Args:
        path: Directory containing the volume's ``mods.xml`` and one
            sub-directory per granule access id.
        options: Mapping of task options.  Read here via ``.get()`` for
            ``textversions``, ``linkpdf`` and ``extracttext``; also passed
            through to ``utils.fetch_committee_names`` and
            ``bill_info.output_bill``.

    Returns:
        ``{'ok': True, 'saved': True}`` after all items have been visited.

    Raises:
        Exception: if ``pdftotext`` exits with a non-zero status.
    """
    mods = etree.parse(path + "/mods.xml")
    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}

    # Load the THOMAS committee names for this Congress, which is our best
    # bet for normalizing committee names in the GPO data.
    congress = mods.find("/mods:extension[2]/mods:congress", mods_ns).text
    utils.fetch_committee_names(congress, options)

    # logging.warn() is a deprecated alias of logging.warning().
    logging.warning("Processing %s (Congress %s)" % (path, congress))

    package_id = mods.find("/mods:extension[2]/mods:accessId", mods_ns).text

    for bill in mods.findall("/mods:relatedItem", mods_ns):
        # MODS files also contain information about:
        # ['BACKMATTER', 'FRONTMATTER', 'CONSTAMEND', 'PROCLAMATION', 'REORGPLAN']
        granule_class = bill.find("mods:extension/mods:granuleClass", mods_ns).text
        if granule_class not in ["PUBLICLAW", "PRIVATELAW", "HCONRES", "SCONRES"]:
            continue

        # Get the title and source URL (used in error messages).
        title_text = bill.find("mods:titleInfo/mods:title", mods_ns).text.replace('""', '"')
        source_url = bill.find("mods:location/mods:url[@displayLabel='Content Detail']", mods_ns).text

        # Bill number: exactly one primary <bill> element is required.
        bill_elements = bill.findall("mods:extension/mods:bill[@priority='primary']", mods_ns)
        if len(bill_elements) == 0:
            logging.error("No bill number identified for '%s' (%s)" % (title_text, source_url))
            continue
        if len(bill_elements) > 1:
            logging.error("Multiple bill numbers identified for '%s'" % title_text)
            for be in bill_elements:
                # etree.tostring() returns bytes by default on Python 3;
                # decode first so the concatenation cannot raise TypeError.
                be_xml = etree.tostring(be)
                if isinstance(be_xml, bytes):
                    be_xml = be_xml.decode("utf-8")
                logging.error(" -- " + be_xml.strip())
            logging.error(" @ " + source_url)
            continue

        bill_element = bill_elements[0]
        bill_congress = bill_element.attrib["congress"]
        bill_type = bill_element.attrib["type"].lower()
        bill_number = bill_element.attrib["number"]
        bill_id = "%s%s-%s" % (bill_type, bill_number, bill_congress)

        # Title
        titles = [{
            "title": title_text,
            "as": "enacted",
            "type": "official",
            "is_for_portion": False,
        }]

        # Subject
        descriptor = bill.find("mods:extension/mods:descriptor", mods_ns)
        subject = descriptor.text if descriptor is not None else None

        # Committees
        committees = []
        cong_committee = bill.find("mods:extension/mods:congCommittee", mods_ns)
        if cong_committee is not None:
            chambers = {"H": "House", "S": "Senate", "J": "Joint"}
            committee = chambers[cong_committee.attrib["chamber"]] + " " + cong_committee.find("mods:name", mods_ns).text
            committees.append({
                "committee": committee,
                "activity": [],  # XXX
                "committee_id": utils.committee_names[committee] if committee in utils.committee_names else None,
            })

        # The 'granuleDate' is the enactment date?
        granule_date = bill.find("mods:extension/mods:granuleDate", mods_ns).text

        sources = [{
            "source": "statutes",
            "package_id": package_id,
            "access_id": bill.find("mods:extension/mods:accessId", mods_ns).text,
            "source_url": source_url,
            "volume": bill.find("mods:extension/mods:volume", mods_ns).text,
            "page": bill.find("mods:part[@type='article']/mods:extent[@unit='pages']/mods:start", mods_ns).text,
            "position": bill.find("mods:extension/mods:pagePosition", mods_ns).text,
        }]

        law_elements = bill.findall("mods:extension/mods:law", mods_ns)

        # XXX: If <law> is missing, this assumes it is a concurrent resolution.
        # This may be a problem if the code is updated to accept joint
        # resolutions for constitutional amendments.
        if len(law_elements) != 1:
            # The recorded action is the vote in the chamber opposite to
            # the one the resolution originated in.
            other_chamber = {"HOUSE": "s", "SENATE": "h"}
            actions = [{
                "type": "vote",
                "vote_type": "vote2",
                "where": other_chamber[bill.find("mods:extension/mods:originChamber", mods_ns).text],
                "result": "pass",  # XXX
                "how": "unknown",  # XXX
                # "text": "",
                "acted_at": granule_date,  # XXX
                "status": "PASSED:CONCURRENTRES",
                "references": [],  # XXX
            }]
        else:
            law_element = law_elements[0]
            law_congress = law_element.attrib["congress"]
            law_number = law_element.attrib["number"]
            law_type = "private" if law_element.attrib["isPrivate"] == "true" else "public"

            # Check for typos in the metadata.
            if law_congress != bill_congress:
                logging.error("Congress mismatch for %s%s: %s or %s? (%s)" % (bill_type, bill_number, bill_congress, law_congress, source_url))
                continue

            actions = [{
                "congress": law_congress,
                "number": law_number,
                "type": "enacted",
                "law": law_type,
                "text": "Became %s Law No: %s-%s." % (law_type.capitalize(), law_congress, law_number),
                "acted_at": granule_date,  # XXX
                "status": "ENACTED:SIGNED",  # XXX: Check for overridden vetoes!
                "references": [],  # XXX
            }]

        status, status_date = bill_info.latest_status(actions)

        bill_data = {
            'bill_id': bill_id,
            'bill_type': bill_type,
            'number': bill_number,
            'congress': bill_congress,

            'introduced_at': None,  # XXX
            'sponsor': None,  # XXX
            'cosponsors': [],  # XXX

            'actions': actions,  # XXX
            'history': bill_info.history_from_actions(actions),
            'status': status,
            'status_at': status_date,
            'enacted_as': bill_info.slip_law_from(actions),

            'titles': titles,
            'official_title': bill_info.current_title_for(titles, "official"),
            'short_title': bill_info.current_title_for(titles, "short"),  # XXX
            'popular_title': bill_info.current_title_for(titles, "popular"),  # XXX

            'subjects_top_term': subject,
            'subjects': [],

            'related_bills': [],  # XXX: <associatedBills> usually only lists the current bill.
            'committees': committees,
            'amendments': [],  # XXX

            'sources': sources,
            'updated_at': datetime.datetime.fromtimestamp(time.time()),
        }

        # With 'textversions' set, only the version record and text are
        # produced; the bill record itself is not written.
        if not options.get('textversions', False):
            bill_info.output_bill(bill_data, options)

        # XXX: Can't use bill_versions.fetch_version() because it depends on fdsys.
        version_code = "enr"
        bill_version_id = "%s%s-%s-%s" % (bill_type, bill_number, bill_congress, version_code)
        bill_version = {
            'bill_version_id': bill_version_id,
            'version_code': version_code,
            'issued_on': status_date,
            'urls': {"pdf": bill.find("mods:location/mods:url[@displayLabel='PDF rendition']", mods_ns).text},
            'sources': sources,
        }
        utils.write(
            json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
            bill_versions.output_for_bill_version(bill_version_id)
        )

        # Process the granule PDF.
        # - Hard-link it into the right place to be seen as bill text.
        # - Run "pdftotext -layout" to convert it to plain text and save it
        #   in the bill text location.
        pdf_file = path + "/" + sources[0]["access_id"] + "/document.pdf"
        if os.path.exists(pdf_file):
            dst_path = fdsys.output_for_bill(bill_data["bill_id"], "text-versions/" + version_code, is_data_dot=False)
            if options.get("linkpdf", False):
                linked_pdf = dst_path + "/document.pdf"
                # os.link() raises when the destination already exists, so
                # skip the link on re-runs instead of crashing the task.
                if not os.path.exists(linked_pdf):
                    os.link(pdf_file, linked_pdf)  # a good idea
            if options.get("extracttext", False):
                # Progress message, not a failure: log at the same level as
                # the "Processing ..." message above.
                logging.warning("Running pdftotext on %s..." % pdf_file)
                if subprocess.call(["pdftotext", "-layout", pdf_file, dst_path + "/document.txt"]) != 0:
                    raise Exception("pdftotext failed on %s" % pdf_file)

    return {'ok': True, 'saved': True}
def proc_statute(path, options):
    """Turn the MODS metadata under ``path`` into bill and version records.

    Reads ``<path>/mods.xml`` and visits every ``relatedItem`` whose granule
    class is PUBLICLAW, PRIVATELAW, HCONRES or SCONRES.  For each one it
    writes a bill record via ``bill_info.output_bill`` and an "enr"
    bill-version record via ``utils.write``.

    Items without exactly one ``<bill>`` element, or whose law and bill
    disagree about the Congress, are logged and skipped.

    Args:
        path: Directory containing ``mods.xml``.
        options: Task options, passed through to
            ``utils.fetch_committee_names`` and ``bill_info.output_bill``.

    Returns:
        ``{'ok': True, 'saved': True}`` after all items have been visited.
    """
    # Kept as function-scope imports (as in the original), but hoisted out
    # of the per-item loop and split one per line.
    import json
    import bill_versions

    mods = etree.parse(path + "/mods.xml")
    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}

    # Load the THOMAS committee names for this Congress, which is our best
    # bet for normalizing committee names in the GPO data.
    congress = mods.find("/mods:extension[2]/mods:congress", mods_ns).text
    utils.fetch_committee_names(congress, options)

    # logging.warn() is a deprecated alias of logging.warning().
    logging.warning("Processing %s (Congress %s)" % (path, congress))

    for bill in mods.findall("/mods:relatedItem", mods_ns):
        # MODS files also contain information about:
        # ['BACKMATTER', 'FRONTMATTER', 'CONSTAMEND', 'PROCLAMATION', 'REORGPLAN']
        # Filter first so skipped items are never parsed any further.
        granule_class = bill.find("mods:extension/mods:granuleClass", mods_ns).text
        if granule_class not in ["PUBLICLAW", "PRIVATELAW", "HCONRES", "SCONRES"]:
            continue

        # NOTE(review): proc_statute_volume also sets "is_for_portion": False
        # on its title entry -- confirm whether bill_info expects it here.
        titles = [{
            "title": bill.find("mods:titleInfo/mods:title", mods_ns).text.replace('""', '"'),
            "as": "enacted",
            "type": "official",
        }]

        descriptor = bill.find("mods:extension/mods:descriptor", mods_ns)
        subject = descriptor.text if descriptor is not None else None

        committees = []
        cong_committee = bill.find("mods:extension/mods:congCommittee", mods_ns)
        if cong_committee is not None:
            chambers = {"H": "House", "S": "Senate", "J": "Joint"}
            committee = chambers[cong_committee.attrib["chamber"]] + " " + cong_committee.find("mods:name", mods_ns).text
            committees.append({
                "committee": committee,
                "activity": [],  # XXX
                "committee_id": utils.committee_names[committee] if committee in utils.committee_names else None,
            })

        # Exactly one <bill> element is required to identify the bill.
        bill_elements = bill.findall("mods:extension/mods:bill", mods_ns)
        if len(bill_elements) != 1:
            logging.error("Could not get bill data for %s" % repr(titles))
            continue

        bill_element = bill_elements[0]
        bill_congress = bill_element.attrib["congress"]
        bill_type = bill_element.attrib["type"].lower()
        bill_number = bill_element.attrib["number"]
        bill_id = "%s%s-%s" % (bill_type, bill_number, bill_congress)

        granule_date = bill.find("mods:extension/mods:granuleDate", mods_ns).text
        law_elements = bill.findall("mods:extension/mods:law", mods_ns)

        # XXX: If <law> is missing, this assumes it is a concurrent resolution.
        # This may be a problem if the code is updated to accept joint
        # resolutions for constitutional amendments.
        if len(law_elements) != 1:
            # The recorded action is the vote in the chamber opposite to
            # the one the resolution originated in.
            other_chamber = {"HOUSE": "s", "SENATE": "h"}
            action = {
                "type": "vote",
                "vote_type": "vote2",
                "where": other_chamber[bill.find("mods:extension/mods:originChamber", mods_ns).text],
                "result": "pass",  # XXX
                "how": "unknown",  # XXX
                # "text": "",
                "acted_at": granule_date,  # XXX
                "status": "PASSED:CONCURRENTRES",
                "references": [],  # XXX
            }
        else:
            law_element = law_elements[0]
            law_congress = law_element.attrib["congress"]
            law_number = law_element.attrib["number"]
            law_type = "private" if law_element.attrib["isPrivate"] == "true" else "public"

            # Check for typos in the metadata.  This must live inside the
            # law branch: law_congress is only defined here, so checking it
            # after the if/else raised UnboundLocalError (or compared a
            # stale value from an earlier item) for concurrent resolutions.
            if law_congress != bill_congress:
                logging.error("Congress mismatch for %s%s: %s or %s?" % (bill_type, bill_number, bill_congress, law_congress))
                continue

            action = {
                "congress": law_congress,
                "number": law_number,
                "type": "enacted",
                "law": law_type,
                "text": "Became %s Law No: %s-%s." % (law_type.capitalize(), law_congress, law_number),
                "acted_at": granule_date,  # XXX
                "status": "ENACTED:SIGNED",  # XXX: Check for overridden vetoes!
                "references": [],  # XXX
            }

        actions = [action]

        status, status_date = bill_info.latest_status(actions)

        bill_data = {
            'bill_id': bill_id,
            'bill_type': bill_type,
            'number': bill_number,
            'congress': bill_congress,

            'introduced_at': None,  # XXX
            'sponsor': None,  # XXX
            'cosponsors': [],  # XXX

            'actions': actions,  # XXX
            'history': bill_info.history_from_actions(actions),
            'status': status,
            'status_at': status_date,
            'enacted_as': bill_info.slip_law_from(actions),

            'titles': titles,
            'official_title': bill_info.current_title_for(titles, "official"),
            'short_title': bill_info.current_title_for(titles, "short"),  # XXX
            'popular_title': bill_info.current_title_for(titles, "popular"),  # XXX

            # 'summary': summary,
            'subjects_top_term': subject,
            'subjects': [],

            'related_bills': [],  # XXX: <associatedBills> usually only lists the current bill.
            'committees': committees,
            'amendments': [],  # XXX

            'updated_at': datetime.datetime.fromtimestamp(time.time()),
        }

        bill_info.output_bill(bill_data, options)

        # XXX: Can't use bill_versions.fetch_version() because it depends on fdsys.
        version_code = "enr"
        bill_version_id = "%s%s-%s-%s" % (bill_type, bill_number, bill_congress, version_code)
        bill_version = {
            'bill_version_id': bill_version_id,
            'version_code': version_code,
            'issued_on': status_date,
            'urls': {"pdf": bill.find("mods:location/mods:url[@displayLabel='PDF rendition']", mods_ns).text},
        }

        utils.write(
            json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
            bill_versions.output_for_bill_version(bill_version_id)
        )

    return {'ok': True, 'saved': True}