def test_section_intro_references(self):
    """Term refs in a section's intro paragraph are keyed by the section."""
    # The fixture is spelled as adjacent literals so the markup text stays
    # exactly as the expected offsets below assume.
    section_xml = (
        ' <section label="1024-3" sectionNum="3" xmlns="eregs">'
        ' <subject>§ 1024.3 Questions or suggestions from public'
        ' and copies of public guidance documents.</subject>'
        ' <paragraph label="1024-3-p1" marker="">'
        ' <content>Any questions regarding'
        ' <ref target="1024-defs" reftype="term">RESPA</ref>. </content>'
        ' </paragraph>'
        ' <paragraph label="1024-defs" marker="">'
        ' <content>This paragraph contains references for the term'
        ' <def term="respa">RESPA</def>. </content>'
        ' </paragraph>'
        ' </section>'
    )
    terms_layer = build_terms_layer(etree.fromstring(section_xml))

    # The first paragraph is an intro paragraph, so for reg-site its content
    # gets pushed into the section's text area. The terms layer should
    # therefore carry the reference under the section's label rather than
    # under the paragraph's label.
    intro_reference = OrderedDict([
        (u'offsets', [[24, 29]]),
        (u'ref', u'respa:1024-defs'),
    ])
    respa_definition = OrderedDict([
        (u'position', [48, 53]),
        (u'reference', '1024-defs'),
        (u'term', 'respa'),
    ])
    expected_layer = OrderedDict([
        ('1024-3', [intro_reference]),
        (u'referenced', OrderedDict([
            (u'respa:1024-defs', respa_definition),
        ])),
    ])

    self.assertEqual(expected_layer, terms_layer)
def validate(file, no_terms=False, no_citations=False, no_keyterms=False):
    """ Validate a RegML file and return the validator that was used.

    ``file`` is resolved through ``find_file``. For regulation documents the
    terms, internal-citation and keyterm checks run unless switched off by
    the matching ``no_*`` flag, and the collected validator events are
    printed.
    """
    path = find_file(file)
    with open(path, 'r') as handle:
        raw_xml = handle.read()
    xml_tree = etree.fromstring(raw_xml, etree.XMLParser(huge_tree=True))

    # Schema validation happens while the validator is being obtained.
    validator = get_validator(xml_tree)

    root_tag = xml_tree.tag
    if root_tag == '{eregs}regulation':
        # Regulation-specific checks; both layers are built up front.
        terms = build_terms_layer(xml_tree)
        internal_citations = build_internal_citations_layer(xml_tree)

        run_terms = not no_terms
        run_citations = not no_citations
        run_keyterms = not no_keyterms

        if run_terms:
            validator.validate_terms(xml_tree, terms)
        if run_citations:
            validator.validate_internal_cites(xml_tree, internal_citations)
        if run_keyterms:
            validator.validate_keyterms(xml_tree)

        for event in validator.events:
            print(str(event))
    elif root_tag == '{eregs}notice':
        # Placeholder: no notice-specific validation is performed here.
        pass

    return validator
def test_appendix_intro_references(self):
    """Only the first appendix paragraph is treated as the intro paragraph."""
    # The fixture is spelled as adjacent literals so the markup text stays
    # exactly as the expected offsets below assume.
    appendix_xml = (
        ' <appendixSection appendixSecNum="1" label="1024-B-s1" xmlns="eregs">'
        ' <subject/>'
        ' <paragraph label="1024-B-p1-0" marker="">'
        ' <content>The following illustrations provide provisions of'
        ' <ref target="1024-defs" reftype="term">RESPA</ref>. </content>'
        ' </paragraph>'
        ' <paragraph label="1024-B-p1-1" marker="">'
        ' <content>Refer to the'
        ' <ref target="1024-defs" reftype="term">Bureau</ref>'
        "'s regulations for"
        ' <ref target="1024-defs" reftype="term">HUD-1</ref>. </content>'
        ' </paragraph>'
        ' <paragraph label="1024-defs" marker="">'
        ' <content>This paragraph contains terms'
        ' <def term="bureau">Bureau</def>,'
        ' <def term="respa">RESPA</def>, and'
        ' <def term="hud-1">HUD-1</def>. </content>'
        ' </paragraph>'
        ' </appendixSection>'
    )
    terms_layer = build_terms_layer(etree.fromstring(appendix_xml))

    # The first paragraph is an intro paragraph, so for reg-site its content
    # gets pushed into the appendixSection text. The terms layer should
    # therefore carry that reference under the appendixSection's label, not
    # the paragraph label. The second paragraph keeps its own label, which
    # checks that only the first paragraph becomes an intro paragraph.
    intro_refs = [
        OrderedDict([(u'offsets', [[50, 55]]),
                     (u'ref', u'bureau:1024-defs')]),
    ]
    second_paragraph_refs = [
        OrderedDict([(u'offsets', [[13, 19]]),
                     (u'ref', u'bureau:1024-defs')]),
        OrderedDict([(u'offsets', [[38, 43]]),
                     (u'ref', u'bureau:1024-defs')]),
    ]
    definitions = OrderedDict([
        (u'bureau:1024-defs', OrderedDict([(u'position', [30, 36]),
                                           (u'reference', '1024-defs'),
                                           (u'term', 'bureau')])),
        (u'respa:1024-defs', OrderedDict([(u'position', [38, 43]),
                                          (u'reference', '1024-defs'),
                                          (u'term', 'respa')])),
        (u'hud-1:1024-defs', OrderedDict([(u'position', [49, 54]),
                                          (u'reference', '1024-defs'),
                                          (u'term', 'hud-1')])),
    ])
    expected_layer = OrderedDict([
        ('1024-B-s1', intro_refs),
        ('1024-B-p1-1', second_paragraph_refs),
        (u'referenced', definitions),
    ])

    self.assertEqual(expected_layer, terms_layer)
def check_terms(file, label=None, term=None):
    """ Check the terms in a RegML file.

    ``file`` is resolved through ``find_file``. Notice files cannot be
    checked: the function prints a message and exits with status 1.
    ``label`` and ``term`` are passed through unchanged to the validator's
    ``validate_term_references``.
    """
    file = find_file(file)
    with open(file, 'r') as f:
        reg_xml = f.read()

    # Parse with huge_tree=True like the other entry points in this file so
    # that large regulation documents are not rejected by the parser's
    # default limits.
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)

    if xml_tree.tag == '{eregs}notice':
        print("Cannot check terms in notice files")
        sys.exit(1)

    # Validate the file relative to schema
    validator = get_validator(xml_tree)

    terms = build_terms_layer(xml_tree)
    validator.validate_terms(xml_tree, terms)
    validator.validate_term_references(xml_tree, terms, file,
                                       label=label, term=term)
def test_para_with_defs_offsets(self):
    """Definition offsets account for the paragraph's marker and title."""
    # The fixture is spelled as adjacent literals so the markup text stays
    # exactly as the expected offsets below assume.
    appendix_xml = (
        ' <appendixSection appendixSecNum="1" label="1024-s1" xmlns="eregs">'
        ' <subject/>'
        ' <paragraph label="1024-defs" marker="1.">'
        ' <title type="keyterm">Definitions.</title>'
        ' <content>This paragraph contains definitions to check offsets, like'
        ' <def term="bureau">Bureau</def>. </content>'
        ' </paragraph>'
        ' </appendixSection>'
    )
    terms_layer = build_terms_layer(etree.fromstring(appendix_xml))

    # The paragraph has definitions plus a title (type: keyterm), to test
    # that the appropriate offsets are calculated for both marker and title.
    bureau_definition = OrderedDict([
        (u'position', [74, 80]),
        (u'reference', '1024-defs'),
        (u'term', 'bureau'),
    ])
    expected_layer = OrderedDict([
        (u'referenced', OrderedDict([
            (u'bureau:1024-defs', bureau_definition),
        ])),
    ])

    self.assertEqual(expected_layer, terms_layer)
def check_terms(file, label=None, term=None, with_notice=None):
    """ Check the terms in a RegML file.

    ``file`` must name a regulation; if it turns out to be a notice the
    function prints usage guidance and exits with status 1.

    When ``with_notice`` is given, that notice is applied to the regulation
    first and the terms are checked against the result. In that case the
    notice's path, not the regulation's, is handed to the term checker so
    that it is the notice that gets written.

    ``label`` and ``term`` are passed through unchanged to the validator's
    ``validate_term_references``.
    """
    file = find_file(file)
    with open(file, 'r') as f:
        reg_string = f.read()
    parser = etree.XMLParser(huge_tree=True)
    reg_tree = etree.fromstring(reg_string, parser)

    if reg_tree.tag == '{eregs}notice':
        print("Cannot check terms in notice files directly.")
        print("Use a regulation file and --with-notice to specify the notice that applies.")
        sys.exit(1)

    # If we're given a notice, apply it to the given regulation file,
    # then check terms in the result and write it out to the notice file
    # as changes.
    notice_tree = None
    if with_notice is not None:
        # file is changed here so the term checker will write the notice
        # instead of the regulation
        file = find_file(with_notice, is_notice=True)
        with open(file, 'r') as f:
            notice_xml = f.read()
        # Parse the notice with huge_tree=True as well, matching how the
        # regulation above is parsed, so a large notice is not rejected by
        # the parser's default limits.
        notice_parser = etree.XMLParser(huge_tree=True)
        notice_tree = etree.fromstring(notice_xml, notice_parser)

        # Process the notice changeset
        print(colored('Applying notice...', attrs=['bold']))
        reg_tree = process_changes(reg_tree, notice_tree)

    # Validate the file relative to schema
    validator = get_validator(reg_tree)

    terms = build_terms_layer(reg_tree)
    validator.validate_terms(reg_tree, terms)
    validator.validate_term_references(reg_tree, terms, file,
                                       label=label, term=term,
                                       notice=notice_tree)
def apply_through(cfr_title, cfr_part, start=None, through=None,
                  fix_notices=False, skip_fix_notices=[],
                  skip_fix_notices_through=None):
    """Apply locally available notices to a regulation, in order.

    Each notice's changeset is applied to the tree produced by the previous
    one, starting from the initial version, and every intermediate
    regulation is written next to the initial version's file under the
    notice's file name.

    Args:
        cfr_title: CFR title; only used in messages.
        cfr_part: CFR part whose notices are looked up with ``find_all``.
        start: currently unused.
        through: notice number or list index to stop at. When ``None`` the
            user is prompted; an empty answer applies every notice.
        fix_notices: when true, run the term/citation fixers on each notice
            before applying it and reload the notice from disk afterwards.
        skip_fix_notices: notice numbers that are never fixed. The default
            list is only copied, never mutated; ``None`` is treated as empty.
        skip_fix_notices_through: if it names a known notice, that notice
            and all earlier ones (in apply order) are not fixed either.

    Returns:
        None. The function returns early after printing a message when
        notices cannot be read or the requested notice does not exist, and
        exits the process with status 1 if a notice fails validation.
    """
    # Get list of notices that apply to this reg
    # Look for locally available notices
    regml_notice_files = find_all(cfr_part, is_notice=True)

    regml_notices = []
    for notice_file in regml_notice_files:
        file_name = os.path.join(notice_file)
        with open(file_name, 'r') as f:
            notice_xml = f.read()
        parser = etree.XMLParser(huge_tree=True)

        try:
            xml_tree = etree.fromstring(notice_xml, parser)
        except etree.XMLSyntaxError as e:
            print(colored('Syntax error in {}'.format(notice_file), 'red'))
            print(e)
            return

        doc_number = xml_tree.find(
            './{eregs}preamble/{eregs}documentNumber').text
        effective_date = xml_tree.find(
            './{eregs}preamble/{eregs}effectiveDate').text
        applies_to = xml_tree.find(
            './{eregs}changeset').get('leftDocumentNumber')
        if applies_to is None:
            # Major problem here
            print(colored("Error locating"),
                  colored("leftDocumentNumber", attrs=['bold']),
                  colored("attribute in"),
                  colored("{}".format(doc_number), 'red', attrs=['bold']))
            return

        regml_notices.append(
            (doc_number, effective_date, applies_to, file_name))

    # Apply order: the configured custom order if there is one, otherwise
    # by effective date.
    if cfr_part in settings.CUSTOM_NOTICE_ORDER:
        order = settings.CUSTOM_NOTICE_ORDER[cfr_part]
        regml_notices.sort(key=lambda n: order.index(n[0]))
    else:
        regml_notices.sort(key=lambda n: n[1])

    # The smallest leftDocumentNumber is taken as the initial version.
    regs = sorted(nn[2] for nn in regml_notices)

    # If no notices found, issue error message
    if not regml_notices:
        print(colored("\nNo available notices for reg {} in part {}".format(
            cfr_part, cfr_title)))
        return

    # If initial version is not findable, issue error message
    if regs[0] is None:
        print(colored("\nError reading initial version and apply order for "
                      "reg {} in part {}. No changes have been "
                      "made.".format(cfr_part, cfr_title),
                      attrs=['bold']))
        return

    # Generate prompt for user
    print(colored("\nAvailable notices for reg {}:".format(cfr_part),
                  attrs=['bold']))
    print("{:>3}. {:<22}(Initial version)".format(0, regs[0]))
    # Process notices found
    for number, listed_notice in enumerate(regml_notices, 1):
        print("{0:>3}. {1[0]:<22}(Effective: {1[1]})".format(
            number, listed_notice))
    print()

    # Possible answers are blank (all), the numbers, or the notice names
    possible_indices = [str(kk) for kk in range(len(regml_notices) + 1)]
    possible_notices = [nn[0] for nn in regml_notices]

    # If notice number is supplied, use that one
    if through is not None:
        print("Command-line option selected notice '{}'".format(through))
        answer = through
    else:
        # Get user input to specify end version
        answer = None
        valid_answers = [""] + possible_indices + possible_notices
        while answer not in valid_answers:
            answer = raw_input(
                'Press enter to apply all or enter notice number: [all] ')

    if answer == "":
        # Apply notices
        last_ver_idx = len(regml_notices) - 1
    elif answer == "0":
        # Compared by value: identity comparison against a string literal
        # is not guaranteed to match an equal string.
        # Cancel - this is just the initial version
        print(colored("CANCELED: Version", attrs=['bold']),
              colored("{}".format(regs[0]), 'yellow', attrs=['bold']),
              colored("is the initial version - no changes have been made.",
                      attrs=['bold']))
        return
    elif answer in possible_indices:
        # Apply notices through answer-1 to adjust for the initial ver offset
        last_ver_idx = int(answer) - 1
    elif answer in possible_notices:
        # Find index to stop at in notice list
        last_ver_idx = possible_notices.index(answer)
    else:
        print(colored("ERROR: Notice", attrs=['bold']),
              colored("{}".format(answer), 'red', attrs=['bold']),
              colored("does not exist - no changes have been made.",
                      attrs=['bold']))
        return

    print(colored("\nApplying notices through {0[0]}\n".format(
        regml_notices[last_ver_idx]), attrs=['bold']))

    # Perform the notice application process
    reg_path = os.path.abspath(os.path.join(settings.XML_ROOT,
                                            'regulation',
                                            cfr_part,
                                            '{}.xml'.format(regs[0])))
    print("Opening initial version {}".format(reg_path))
    regulation_file = find_file(reg_path)
    with open(regulation_file, 'r') as f:
        left_reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    left_xml_tree = etree.fromstring(left_reg_xml, parser)

    # The set of notices exempt from fixing does not depend on the notice
    # being processed, so work it out once before the loop.
    skip_notices = list(skip_fix_notices or [])
    if (skip_fix_notices_through is not None
            and skip_fix_notices_through in possible_notices):
        last_fix_idx = possible_notices.index(skip_fix_notices_through)
        skip_notices.extend(possible_notices[:last_fix_idx + 1])

    prev_tree = left_xml_tree
    notices_to_apply = regml_notices[:last_ver_idx + 1]
    for kk, notice in enumerate(notices_to_apply, 1):
        doc_number, effective_date, prev_notice, file_name = notice

        print("[{}] Applying notice {} from {} to version {}".format(
            kk, doc_number, file_name, prev_notice))

        # Open the notice file
        notice_file = find_file(file_name, is_notice=True)
        with open(notice_file, 'r') as f:
            notice_string = f.read()
        parser = etree.XMLParser(huge_tree=True)
        notice_xml = etree.fromstring(notice_string, parser)

        # TODO: Validate labels for json-compliance?
        # Example: JSON fails on upload only for interpParagraphs without
        # "Interp" in them

        # Validate the files. The regulation validator's result is not used
        # afterwards; the call is kept as in the original flow.
        get_validator(prev_tree)
        terms_layer = build_terms_layer(prev_tree)

        try:
            notice_validator = get_validator(notice_xml,
                                             raise_instead_of_exiting=True)
        except Exception as e:
            print("[{}]".format(kk),
                  colored("Exception occurred in notice", 'red'),
                  colored(doc_number, attrs=['bold']),
                  colored("; details are below. ", 'red'),
                  "To retry this single notice, use:\n\n",
                  colored("> ./regml.py apply-notice {0}/{1} {0}/{2}\n".format(
                      cfr_part, prev_notice, doc_number), attrs=['bold']))
            # Show the details the message promises, and report the failure
            # through a non-zero exit status.
            print(e)
            sys.exit(1)

        # validate the notice XML with the layers derived from the
        # tree of the previous version
        if fix_notices and doc_number not in skip_notices:
            print('Fixing notice number {}:'.format(doc_number))
            notice_validator.validate_terms(notice_xml, terms_layer)
            notice_validator.validate_term_references(
                notice_xml, terms_layer, notice_file)
            notice_terms_layer = build_terms_layer(notice_xml)
            notice_validator.validate_term_references(
                notice_xml, notice_terms_layer, notice_file)
            notice_validator.fix_omitted_cites(notice_xml, notice_file)

            # at this point the file has possibly changed, so we should
            # really reload it
            with open(notice_file, 'r') as f:
                notice_string = f.read()
            parser = etree.XMLParser(huge_tree=True)
            notice_xml = etree.fromstring(notice_string, parser)

        # Process the notice changeset
        try:
            new_xml_tree = process_changes(prev_tree, notice_xml)
        except Exception:
            print("[{}]".format(kk),
                  colored("Exception occurred; details are below. ", 'red'),
                  "To retry this single notice, use:\n\n",
                  colored("> ./regml.py apply-notice {0}/{1} {0}/{2}\n".format(
                      cfr_part, prev_notice, doc_number), attrs=['bold']))
            # Bare raise keeps the original traceback intact.
            raise

        # Add in any new analysis
        new_xml_tree = process_analysis(new_xml_tree, notice_xml)

        # Write the new xml tree
        new_xml_string = etree.tostring(new_xml_tree,
                                        pretty_print=True,
                                        xml_declaration=True,
                                        encoding='UTF-8')
        new_path = os.path.join(
            os.path.dirname(regulation_file),
            os.path.basename(notice_file))
        with open(new_path, 'w') as f:
            print("[{}] Writing regulation to {}".format(kk, new_path))
            f.write(new_xml_string)

        prev_tree = new_xml_tree
def generate_json(regulation_file, check_terms=False):
    """Build every layer for a regulation file and write them out.

    Args:
        regulation_file: regulation to process; resolved with ``find_file``.
        check_terms: when true, also run the validator's
            ``validate_term_references`` against the resolved file.

    Returns:
        A ``(reg_number, notice, xml_tree)`` tuple, where ``notice`` is the
        version name derived from ``regulation_file``'s base name whenever
        it differs from the document number found in the XML.
    """
    # Resolve once so the file that is read is also the file handed to the
    # term-reference check, as check_terms does.
    resolved_file = find_file(regulation_file)
    with open(resolved_file, 'r') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)

    # Validate the file relative to schema
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the layer validations and report any events the validator
    # collected; processing continues afterwards.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, resolved_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    write_layer(reg_json, reg_number, notice, 'regulation')
    write_layer(meta, reg_number, notice, 'layer/meta')
    write_layer(paragraph_markers, reg_number, notice,
                'layer/paragraph-markers')
    write_layer(internal_citations, reg_number, notice,
                'layer/internal-citations')
    write_layer(external_citations, reg_number, notice,
                'layer/external-citations')
    write_layer(terms, reg_number, notice, 'layer/terms')
    write_layer(toc, reg_number, notice, 'layer/toc')
    write_layer(keyterms, reg_number, notice, 'layer/keyterms')
    write_layer(graphics, reg_number, notice, 'layer/graphics')
    write_layer(formatting, reg_number, notice, 'layer/formatting')
    write_layer(interps, reg_number, notice, 'layer/interpretations')
    write_layer(analysis, reg_number, notice, 'layer/analyses')
    write_layer(notice_dict, reg_number, notice, 'notice')

    return reg_number, notice, xml_tree
def parser_driver(regulation_file,
                  check_terms=False,
                  correct_interps=False,
                  headerize_interps=False,
                  fix_missed_cites=False):
    """Validate a regulation file, build every layer and write them out.

    Args:
        regulation_file: path of the regulation XML; opened as given.
        check_terms: also run the validator's ``validate_term_references``.
        correct_interps: run the validator's ``insert_interp_markers``
            before the layers are built.
        headerize_interps: run the validator's ``headerize_interps`` before
            the layers are built.
        fix_missed_cites: run the validator's ``fix_omitted_cites`` before
            the layers are built.

    If the document fails schema validation, the validator's events are
    printed and the process exits with status 1.
    """
    with open(regulation_file, 'r') as f:
        reg_xml = f.read()
    # Parse with huge_tree=True like the other entry points in this file so
    # that large regulation documents are not rejected by the parser's
    # default limits.
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)

    # validate relative to schema
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # An invalid document is a failure; signal it with a non-zero status.
        sys.exit(1)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # we can correct interps right away if necessary
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the layer validations and report any events the validator
    # collected; processing continues afterwards.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), using version'.format(
            notice, version))
        notice = version

    write_layer(reg_json, reg_number, notice, 'regulation')
    write_layer(meta, reg_number, notice, 'layer/meta')
    write_layer(paragraph_markers, reg_number, notice,
                'layer/paragraph-markers')
    write_layer(internal_citations, reg_number, notice,
                'layer/internal-citations')
    write_layer(external_citations, reg_number, notice,
                'layer/external-citations')
    write_layer(terms, reg_number, notice, 'layer/terms')
    write_layer(toc, reg_number, notice, 'layer/toc')
    write_layer(keyterms, reg_number, notice, 'layer/keyterms')
    write_layer(graphics, reg_number, notice, 'layer/graphics')
    write_layer(formatting, reg_number, notice, 'layer/formatting')
    write_layer(interps, reg_number, notice, 'layer/interpretations')
    write_layer(analysis, reg_number, notice, 'layer/analyses')
    write_layer(notice_dict, reg_number, notice, 'notice')