Python build_terms_layerの例、regulation.tree.build_terms_layer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: regulation_tree_tests.py プロジェクト: cfpb/regulations-xml-parser

    def test_section_intro_references(self):
        # A section whose first (marker-less) paragraph references a term
        # that is defined in a sibling paragraph.
        section = etree.fromstring("""
        <section label="1024-3" sectionNum="3" xmlns="eregs">
            <subject>§ 1024.3 Questions or suggestions from public and copies of public guidance documents.</subject>
            <paragraph label="1024-3-p1" marker="">
              <content>Any questions regarding <ref target="1024-defs" reftype="term">RESPA</ref>.
              </content>
            </paragraph>
            <paragraph label="1024-defs" marker="">
              <content>This paragraph contains references for the term <def term="respa">RESPA</def>.
              </content>
            </paragraph>
          </section>""")
        actual = build_terms_layer(section)

        # The first paragraph is an intro paragraph: reg-site folds its
        # content into the section's own text area, so the term reference
        # is expected under the section label rather than the paragraph
        # label.
        intro_references = [
            OrderedDict([(u'offsets', [[24, 29]]),
                         (u'ref', u'respa:1024-defs')]),
        ]
        respa_definition = OrderedDict([(u'position', [48, 53]),
                                        (u'reference', '1024-defs'),
                                        (u'term', 'respa')])

        expected = OrderedDict()
        expected['1024-3'] = intro_references
        expected[u'referenced'] = OrderedDict(
            [(u'respa:1024-defs', respa_definition)])

        self.assertEqual(expected, actual)

コード例 #2

0

ファイルを表示

ファイル: regml.py プロジェクト: cfpb/regulations-xml-parser

def validate(file, no_terms=False, no_citations=False, no_keyterms=False):
    """ Validate a RegML file.

    The file is located with ``find_file``, parsed, and handed to
    ``get_validator``. For documents whose root is ``{eregs}regulation``
    the terms, internal-citation and keyterm checks are run (each can be
    switched off with its ``no_*`` flag) and every validator event is
    printed. Other documents, including notices, get no extra checks.

    Returns the validator object.
    """
    path = find_file(file)
    with open(path, 'r') as handle:
        raw_xml = handle.read()
    root = etree.fromstring(raw_xml, etree.XMLParser(huge_tree=True))

    # Validate the file relative to schema
    validator = get_validator(root)

    if root.tag != '{eregs}regulation':
        # Nothing further is checked for notices (or any other root tag).
        return validator

    # Both layers are built up front, whichever checks are enabled.
    terms_layer = build_terms_layer(root)
    citations_layer = build_internal_citations_layer(root)

    if not no_terms:
        validator.validate_terms(root, terms_layer)
    if not no_citations:
        validator.validate_internal_cites(root, citations_layer)
    if not no_keyterms:
        validator.validate_keyterms(root)

    for event in validator.events:
        print(str(event))

    return validator

コード例 #3

0

ファイルを表示

ファイル: regulation_tree_tests.py プロジェクト: cfpb/regulations-xml-parser

    def test_appendix_intro_references(self):
        # An appendix section with two marker-less paragraphs that
        # reference terms, plus one paragraph that defines those terms.
        appendix_section = etree.fromstring("""
        <appendixSection appendixSecNum="1" label="1024-B-s1" xmlns="eregs">
          <subject/>
          <paragraph label="1024-B-p1-0" marker="">
            <content>The following illustrations provide provisions of <ref target="1024-defs" reftype="term">RESPA</ref>.
            </content>
          </paragraph>
          <paragraph label="1024-B-p1-1" marker="">
            <content>Refer to the <ref target="1024-defs" reftype="term">Bureau</ref>'s regulations for <ref target="1024-defs" reftype="term">HUD-1</ref>.
            </content>
          </paragraph>
          <paragraph label="1024-defs" marker="">
            <content>This paragraph contains terms <def term="bureau">Bureau</def>, <def term="respa">RESPA</def>, and <def term="hud-1">HUD-1</def>.
            </content>
          </paragraph>
        </appendixSection>""")
        actual = build_terms_layer(appendix_section)

        # Only the first paragraph is treated as an intro paragraph: for
        # reg-site its content is folded into the appendixSection text, so
        # its reference is expected under the appendixSection label. The
        # second paragraph keeps its own label.
        intro_references = [
            OrderedDict([(u'offsets', [[50, 55]]),
                         (u'ref', u'bureau:1024-defs')]),
        ]
        second_paragraph_references = [
            OrderedDict([(u'offsets', [[13, 19]]),
                         (u'ref', u'bureau:1024-defs')]),
            OrderedDict([(u'offsets', [[38, 43]]),
                         (u'ref', u'bureau:1024-defs')]),
        ]

        definitions = OrderedDict()
        definitions[u'bureau:1024-defs'] = OrderedDict(
            [(u'position', [30, 36]),
             (u'reference', '1024-defs'),
             (u'term', 'bureau')])
        definitions[u'respa:1024-defs'] = OrderedDict(
            [(u'position', [38, 43]),
             (u'reference', '1024-defs'),
             (u'term', 'respa')])
        definitions[u'hud-1:1024-defs'] = OrderedDict(
            [(u'position', [49, 54]),
             (u'reference', '1024-defs'),
             (u'term', 'hud-1')])

        expected = OrderedDict()
        expected['1024-B-s1'] = intro_references
        expected['1024-B-p1-1'] = second_paragraph_references
        expected[u'referenced'] = definitions

        self.assertEqual(expected, actual)

コード例 #4

0

ファイルを表示

ファイル: regml.py プロジェクト: ascott1/regulations-xml-parser

def check_terms(file, label=None, term=None):
    """ Check the terms in a RegML file.

    Notice files cannot be checked: for a ``{eregs}notice`` root a message
    is printed and the process exits with status 1. Otherwise the terms
    layer is built and passed to the validator's term and term-reference
    checks; ``label`` and ``term`` are forwarded to the latter unchanged.
    """
    path = find_file(file)
    with open(path, 'r') as handle:
        raw_xml = handle.read()
    root = etree.fromstring(raw_xml)

    if root.tag == '{eregs}notice':
        print("Cannot check terms in notice files")
        sys.exit(1)

    # Validate the file relative to schema
    validator = get_validator(root)

    terms_layer = build_terms_layer(root)
    validator.validate_terms(root, terms_layer)
    validator.validate_term_references(
        root, terms_layer, path, label=label, term=term)

コード例 #5

0

ファイルを表示

ファイル: regulation_tree_tests.py プロジェクト: cfpb/regulations-xml-parser

    def test_para_with_defs_offsets(self):
        # A single paragraph that carries a marker ("1."), a keyterm title
        # and a term definition inside its content.
        appendix_section = etree.fromstring("""
        <appendixSection appendixSecNum="1" label="1024-s1" xmlns="eregs">
          <subject/>
          <paragraph label="1024-defs" marker="1.">
            <title type="keyterm">Definitions.</title>
            <content>This paragraph contains definitions to check offsets, like <def term="bureau">Bureau</def>.
            </content>
          </paragraph>
        </appendixSection>""")
        actual = build_terms_layer(appendix_section)

        # The definition's position is expected to account for both the
        # paragraph marker and the keyterm title that precede the content.
        bureau_definition = OrderedDict([(u'position', [74, 80]),
                                         (u'reference', '1024-defs'),
                                         (u'term', 'bureau')])
        definitions = OrderedDict([(u'bureau:1024-defs', bureau_definition)])
        expected = OrderedDict([(u'referenced', definitions)])

        self.assertEqual(expected, actual)

コード例 #6

0

ファイルを表示

ファイル: regml.py プロジェクト: cfpb/regulations-xml-parser

def check_terms(file, label=None, term=None, with_notice=None):
    """ Check the terms in a RegML file

    Args:
        file: regulation file to check; resolved with ``find_file``.
        label: forwarded unchanged to ``validate_term_references``.
        term: forwarded unchanged to ``validate_term_references``.
        with_notice: optional notice to apply to the regulation before
            checking. When given, the notice path (not the regulation
            path) is the file passed to the term-reference validator.

    Exits with status 1 if ``file`` itself is a notice.
    """

    file = find_file(file)
    with open(file, 'r') as f:
        reg_string = f.read()
    parser = etree.XMLParser(huge_tree=True)
    reg_tree = etree.fromstring(reg_string, parser)

    if reg_tree.tag == '{eregs}notice':
        print("Cannot check terms in notice files directly.")
        print("Use a regulation file and --with-notice to specify the notice that applies.")
        sys.exit(1)

    # If we're given a notice, apply it to the given regulation file,
    # then check terms in the result and write it out to the notice file
    # as changes.
    notice_tree = None
    if with_notice is not None:
        # file is changed here so the term checker will write the notice
        # instead of the regulation
        file = find_file(with_notice, is_notice=True)
        with open(file, 'r') as f:
            notice_xml = f.read()
        # Parse the notice with the same huge_tree setting as the
        # regulation, so a large notice is not rejected by the default
        # parser limits.
        notice_parser = etree.XMLParser(huge_tree=True)
        notice_tree = etree.fromstring(notice_xml, notice_parser)

        # Process the notice changeset
        print(colored('Applying notice...', attrs=['bold']))
        reg_tree = process_changes(reg_tree, notice_tree)

    # Validate the file relative to schema
    validator = get_validator(reg_tree)

    terms = build_terms_layer(reg_tree)
    validator.validate_terms(reg_tree, terms)
    validator.validate_term_references(reg_tree, terms, file,
            label=label, term=term, notice=notice_tree)

コード例 #7

0

ファイルを表示

ファイル: regml.py プロジェクト: cfpb/regulations-xml-parser

def apply_through(cfr_title, cfr_part, start=None, through=None,
                  fix_notices=False, skip_fix_notices=[],
                  skip_fix_notices_through=None):
    """ Apply the locally available notices for a regulation, in order,
    starting from its initial version.

    Every notice found for ``cfr_part`` is read, the notices are ordered
    (by ``settings.CUSTOM_NOTICE_ORDER`` when the part is listed there,
    otherwise by effective date), and the notices up to the selected one
    are applied one after another. After each notice the resulting
    regulation is written into the directory of the initial version, under
    the notice file's base name.

    Args:
        cfr_title: only used in the messages printed to the user.
        cfr_part: part whose notices and initial version are looked up.
        start: accepted but not used by this function.
        through: last notice to apply, given either as its number in the
            printed list or as its document number; ``""`` means all and
            ``"0"`` means the initial version (nothing is applied). When
            ``None`` the user is prompted.
        fix_notices: when true, run the validator's term, term-reference
            and omitted-cite fixes on each notice before applying it.
        skip_fix_notices: document numbers that are never fixed. The
            default list is only read, never mutated.
        skip_fix_notices_through: if it names a known notice, every notice
            up to and including it is also exempt from fixing.

    Returns:
        None. Problems found before any notice is applied are reported
        with a message and an early return. If a notice validator cannot
        be created the process exits; if applying a changeset fails the
        exception is re-raised.
    """
    # Collect (document number, effective date, base version, path) for
    # every locally available notice of this part.
    regml_notices = []
    for file_name in find_all(cfr_part, is_notice=True):
        with open(file_name, 'r') as f:
            notice_xml = f.read()
        parser = etree.XMLParser(huge_tree=True)

        try:
            xml_tree = etree.fromstring(notice_xml, parser)
        except etree.XMLSyntaxError as e:
            print(colored('Syntax error in {}'.format(file_name), 'red'))
            print(e)
            return

        doc_number = xml_tree.find(
            './{eregs}preamble/{eregs}documentNumber').text
        effective_date = xml_tree.find(
            './{eregs}preamble/{eregs}effectiveDate').text
        applies_to = xml_tree.find(
            './{eregs}changeset').get('leftDocumentNumber')
        if applies_to is None:
            # Major problem here
            print(colored("Error locating"),
                  colored("leftDocumentNumber", attrs=['bold']),
                  colored("attribute in"),
                  colored("{}".format(doc_number), 'red',
                          attrs=['bold']))
            return

        regml_notices.append((doc_number, effective_date, applies_to, file_name))

    if cfr_part in settings.CUSTOM_NOTICE_ORDER:
        order = settings.CUSTOM_NOTICE_ORDER[cfr_part]
        regml_notices.sort(key=lambda n: order.index(n[0]))
    else:
        regml_notices.sort(key=lambda n: n[1])

    # The smallest "applies to" document number is taken as the initial
    # version of the regulation.
    regs = sorted(nn[2] for nn in regml_notices)

    # If no notices found, issue error message
    if not regml_notices:
        print(colored("\nNo available notices for reg {} in part {}".format(cfr_part, cfr_title)))
        return

    # If initial version is not findable, issue error message
    if regs[0] is None:
        print(colored("\nError reading initial version and apply order for reg {} in part {}. No changes have been made.".format(cfr_part, cfr_title),
                      attrs=['bold']))
        return

    # Generate prompt for user
    print(colored("\nAvailable notices for reg {}:".format(cfr_part),
          attrs=['bold']))
    print("{:>3}. {:<22}(Initial version)".format(0, regs[0]))
    # Process notices found
    for number, notice in enumerate(regml_notices, 1):
        print("{0:>3}. {1[0]:<22}(Effective: {1[1]})".format(number, notice))
    print()

    # Possible answers are blank (all), the numbers, or the notice names
    possible_indices = [str(kk) for kk in range(len(regml_notices) + 1)]
    possible_notices = [nn[0] for nn in regml_notices]

    # If notice number is supplied, use that one
    if through is not None:
        print("Command-line option selected notice '{}'".format(through))
        answer = through
    else:
        # Get user input to specify end version
        valid_answers = [""] + possible_indices + possible_notices
        answer = None
        while answer not in valid_answers:
            answer = raw_input('Press enter to apply all or enter notice number: [all] ')

    if not answer:
        # Apply notices
        last_ver_idx = len(regml_notices) - 1
    elif answer == "0":
        # Compared by value: identity ("is") on strings only works by the
        # accident of interning and is not reliable for user input.
        # Cancel - this is just the initial version
        print(colored("CANCELED: Version", attrs=['bold']),
              colored("{}".format(regs[0]), 'yellow', attrs=['bold']),
              colored("is the initial version - no changes have been made.", attrs=['bold']))
        return
    elif answer in possible_indices:
        # Apply notices through answer-1 to adjust for the initial ver offset
        last_ver_idx = int(answer) - 1
    elif answer in possible_notices:
        # Find index to stop at in notice list
        last_ver_idx = possible_notices.index(answer)
    else:
        print(colored("ERROR: Notice", attrs=['bold']),
              colored("{}".format(answer), 'red', attrs=['bold']),
              colored("does not exist - no changes have been made.", attrs=['bold']))
        return

    print(colored("\nApplying notices through {0[0]}\n".format(regml_notices[last_ver_idx]),
          attrs=['bold']))

    # Perform the notice application process
    reg_path = os.path.abspath(os.path.join(settings.XML_ROOT,
                                            'regulation',
                                            cfr_part,
                                            '{}.xml'.format(regs[0])))
    print("Opening initial version {}".format(reg_path))
    regulation_file = find_file(reg_path)
    with open(regulation_file, 'r') as f:
        left_reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    left_xml_tree = etree.fromstring(left_reg_xml, parser)

    # The set of notices exempt from fixing does not depend on the notice
    # being processed, so it is computed once up front.
    skip_notices = list(skip_fix_notices)
    if skip_fix_notices_through is not None:
        if skip_fix_notices_through in possible_notices:
            last_fix_idx = possible_notices.index(skip_fix_notices_through)
            skip_notices.extend(possible_notices[:last_fix_idx + 1])

    prev_tree = left_xml_tree
    for kk, notice in enumerate(regml_notices[:last_ver_idx + 1], 1):
        doc_number, effective_date, prev_notice, file_name = notice

        print("[{}] Applying notice {} from {} to version {}".format(kk,
                                                                     doc_number,
                                                                     file_name,
                                                                     prev_notice))

        # Open the notice file
        notice_file = find_file(file_name, is_notice=True)
        with open(notice_file, 'r') as f:
            notice_string = f.read()
        parser = etree.XMLParser(huge_tree=True)

        notice_xml = etree.fromstring(notice_string, parser)

        # TODO: Validate labels for json-compliance?
        # Example: JSON fails on upload only for interpParagraphs without "Interp" in them

        # Validate the files
        regulation_validator = get_validator(prev_tree)
        terms_layer = build_terms_layer(prev_tree)

        try:
            notice_validator = get_validator(notice_xml, raise_instead_of_exiting=True)
        except Exception as e:
            print("[{}]".format(kk),
                  colored("Exception occurred in notice", 'red'),
                  colored(doc_number, attrs=['bold']),
                  colored("; details are below. ", 'red'),
                  "To retry this single notice, use:\n\n",
                  colored("> ./regml.py apply-notice {0}/{1} {0}/{2}\n".format(cfr_part,
                                                                               prev_notice,
                                                                               doc_number),
                          attrs=['bold']))
            # The message above promises details, so actually show them.
            print(e)
            # NOTE(review): exit status 0 on a failure looks unintended;
            # left unchanged in case callers rely on it - confirm.
            sys.exit(0)

        # validate the notice XML with the layers derived from the
        # tree of the previous version
        if fix_notices and doc_number not in skip_notices:
            print('Fixing notice number {}:'.format(doc_number))
            notice_validator.validate_terms(notice_xml, terms_layer)
            notice_validator.validate_term_references(notice_xml, terms_layer, notice_file)
            notice_terms_layer = build_terms_layer(notice_xml)
            notice_validator.validate_term_references(notice_xml, notice_terms_layer, notice_file)
            notice_validator.fix_omitted_cites(notice_xml, notice_file)

            # at this point the file has possibly changed, so we should
            # really reload it
            with open(notice_file, 'r') as f:
                notice_string = f.read()
            parser = etree.XMLParser(huge_tree=True)

            notice_xml = etree.fromstring(notice_string, parser)

        # Process the notice changeset
        try:
            new_xml_tree = process_changes(prev_tree, notice_xml)
        except Exception:
            print("[{}]".format(kk),
                  colored("Exception occurred; details are below. ", 'red'),
                  "To retry this single notice, use:\n\n",
                  colored("> ./regml.py apply-notice {0}/{1} {0}/{2}\n".format(cfr_part,
                                                                               prev_notice,
                                                                               doc_number),
                          attrs=['bold']))
            # Bare raise keeps the original traceback for the "details".
            raise

        # Add in any new analysis
        new_xml_tree = process_analysis(new_xml_tree, notice_xml)

        # Write the new xml tree
        new_xml_string = etree.tostring(new_xml_tree,
                                        pretty_print=True,
                                        xml_declaration=True,
                                        encoding='UTF-8')
        new_path = os.path.join(
            os.path.dirname(regulation_file),
            os.path.basename(notice_file))
        with open(new_path, 'w') as f:
            print("[{}] Writing regulation to {}".format(kk, new_path))
            f.write(new_xml_string)

        # The freshly written version is the base for the next notice.
        prev_tree = new_xml_tree

コード例 #8

0

ファイルを表示

ファイル: regml.py プロジェクト: cfpb/regulations-xml-parser

def generate_json(regulation_file, check_terms=False):
    """ Build the regulation tree and all layers for a RegML file and
    write each of them out with ``write_layer``.

    Term and internal-citation validation always runs; term-reference
    validation runs only when ``check_terms`` is true. Validator events
    are printed but do not stop processing. The version used for the
    output is the file's base name without ``.xml``.

    Returns a ``(reg_number, notice, xml_tree)`` tuple.
    """
    source_path = find_file(regulation_file)
    with open(source_path, 'r') as handle:
        raw_xml = handle.read()
    xml_tree = etree.fromstring(raw_xml, etree.XMLParser(huge_tree=True))

    # Validate the file relative to schema
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Layers are built in a fixed order, all from the same XML tree.
    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Report whatever the validator found; processing continues afterwards.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name wins over the document number stored in the XML.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.basename(regulation_file).replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), using version'.format(
            notice, version))
        notice = version

    # (payload, destination) pairs, written in this order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, destination in outputs:
        write_layer(payload, reg_number, notice, destination)

    return reg_number, notice, xml_tree

コード例 #9

0

ファイルを表示

ファイル: reg_xml_parser.py プロジェクト: ascott1/regulations-xml-parser

def parser_driver(regulation_file,
                  check_terms=False,
                  correct_interps=False,
                  headerize_interps=False,
                  fix_missed_cites=False):
    """ Validate a RegML regulation file against the schema, build the
    regulation tree and all layers, and write them out with
    ``write_layer``.

    If schema validation fails, the validator events are printed and the
    process exits. The three optional fix-up flags each invoke the
    matching validator method on the tree before the layers are built.
    Term-reference validation runs only when ``check_terms`` is true.
    """
    with open(regulation_file, 'r') as handle:
        raw_xml = handle.read()
    xml_tree = etree.fromstring(raw_xml)

    # validate relative to schema
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # NOTE(review): exits with status 0 even though validation failed.
        sys.exit(0)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Optional fix-ups run right away, before any layer is derived.
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    # Layers are built in a fixed order, all from the same XML tree.
    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Report whatever the validator found; processing continues afterwards.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name wins over the document number stored in the XML.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.basename(regulation_file).replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    # (payload, destination) pairs, written in this order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, destination in outputs:
        write_layer(payload, reg_number, notice, destination)