Example #1
    def paragraph_with_marker(self, text, tagged_text):
        """The paragraph has a marker, like (a) or a. etc."""
        # To aid in determining collapsed paragraphs, replace any
        # keyterms present
        node_for_keyterms = Node(text, node_type=Node.APPENDIX)
        node_for_keyterms.tagged_text = tagged_text
        node_for_keyterms.label = [initial_marker(text)[0]]
        keyterm = KeyTerms.get_keyterm(node_for_keyterms)
        if keyterm:
            mtext = text.replace(keyterm, ';'*len(keyterm))
        else:
            mtext = text

        for mtext in split_paragraph_text(mtext):
            if keyterm:     # still need the original text
                mtext = mtext.replace(';'*len(keyterm), keyterm)
            # label_candidate = [initial_marker(mtext)[0]]
            # existing_node = None
            # for node in self.nodes:
            #     if node.label == label_candidate:
            #         existing_node = node
            # if existing_node:
            #     self.paragraph_counter += 1
            #     node = Node(mtext, node_type=Node.APPENDIX,
            #                 label=['dup{}'.format(self.paragraph_counter),
            #                        initial_marker(mtext)[0]])
            # else:
            node = Node(mtext, node_type=Node.APPENDIX,
                        label=[initial_marker(mtext)[0]])
            node.tagged_text = tagged_text
            self.nodes.append(node)
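
A note on the masking step above: the keyterm is replaced with a same-length run of ';' so that the paragraph splitter cannot match a false marker inside it, then swapped back after the split. A minimal, self-contained sketch of the idea (the regex here stands in for split_paragraph_text, which is defined elsewhere in the parser):

import re

def mask_split_restore(text, keyterm):
    """Mask the keyterm, split on '(x)'-style markers, then restore it."""
    mask = ';' * len(keyterm)  # same length, so offsets stay stable
    masked = text.replace(keyterm, mask)
    # Stand-in for split_paragraph_text: split before each "(a)"-style marker
    pieces = re.split(r'(?=\([a-z]\))', masked)
    return [piece.replace(mask, keyterm) for piece in pieces if piece]

# A keyterm that itself contains marker-like text such as "(b)" would
# otherwise trigger a spurious split:
print(mask_split_restore(
    '(a) The paragraph (b) exception applies. (c) Otherwise not.',
    'paragraph (b) exception'))
# ['(a) The paragraph (b) exception applies. ', '(c) Otherwise not.']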
    def test_dict_to_node(self):
        dict_node = {
            'text': 'node text',
            'label': ['205', 'A'],
            'node_type': 'appendix'}

        node = compiler.dict_to_node(dict_node)

        self.assertEqual(
            node,
            Node('node text', [], ['205', 'A'], None, 'appendix'))

        dict_node['tagged_text'] = '<E> Tagged </E> text.'

        node = compiler.dict_to_node(dict_node)

        actual_node = Node('node text', [], ['205', 'A'], None, 'appendix')
        actual_node.tagged_text = '<E> Tagged </E> text.'

        created_node = compiler.dict_to_node(dict_node)

        self.assertEqual(actual_node, created_node)
        self.assertEqual(actual_node.tagged_text, created_node.tagged_text)

        dict_node = {
            'text': 'node text'
        }

        node = compiler.dict_to_node(dict_node)
        self.assertEqual(node, dict_node)
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within a
    single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text. @todo: streamline logic"""
    # In addition to the regex above, keyterms are an acceptable prefix. We
    # therefore convert keyterms to satisfy the above regex
    node_for_keyterms = Node(node_text, node_type=Node.INTERP,
                             label=[get_first_interp_marker(node_text)])
    node_for_keyterms.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        node_text = node_text.replace(keyterm, '.'*len(keyterm))

    collapsed_markers = []
    for marker in _first_markers:
        possible = ((m, m.start(), m.end())
                    for m in marker.finditer(node_text) if m.start() > 0)
        possible = remove_citation_overlaps(node_text, possible)
        # If certain characters follow, kill it
        for following in ("e.", ")", u"”", '"', "'"):
            possible = [(m, s, end) for m, s, end in possible
                        if not node_text[end:].startswith(following)]
        possible = [m for m, _, _ in possible]
        # As all "1." collapsed markers must be emphasized, run a quick
        # check to weed out some false positives
        if '<E T="03">1' not in tagged_text:
            possible = filter(lambda m: m.group(1) != '1', possible)
        collapsed_markers.extend(possible)
    return collapsed_markers
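
The m.start() > 0 guard above is what separates a collapsed marker (one that begins mid-paragraph) from the paragraph's own leading marker. A stripped-down, runnable illustration, with simplified stand-ins for the module-level _first_markers patterns:

import re

# Simplified stand-ins for _first_markers; the real patterns are stricter.
first_markers_demo = [re.compile(r'\(([a-z])\)'), re.compile(r'\b(\d+)\.')]

def find_collapsed(node_text):
    found = []
    for marker in first_markers_demo:
        for m in marker.finditer(node_text):
            if m.start() > 0:  # position 0 is the paragraph's own marker
                found.append(m.group(1))
    return found

print(find_collapsed('(a) Definitions. 1. Act means the statute. (b) Scope.'))
# ['b', '1']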
    def test_keyterm_is_first_not_first(self):
        node = Node('(a) This has a list: apples et seq.',
                    label=['101', '22', 'a'])
        node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'

        kt = KeyTerms(None)
        self.assertFalse(kt.keyterm_is_first(node, 'et seq.'))
    def test_keyterm_definition(self):
        node = Node("(a) Terminator means I'll be back",
                    label=['101', '22', 'a'])
        node.tagged_text = """(a) <E T="03">Terminator</E> means I'll be """
        node.tagged_text += 'back'
        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertEqual(results, None)

        node = Node("(1) Act means pretend", label=['101', '22', 'a', '1'])
        node.tagged_text = """(1) <E T="03">Act</E> means pretend"""
        node = Node("(1) Act means the Truth in Lending Act (15 U.S.C. 1601 et seq.).", label=['1026', '2', 'a', '1'])
        node.tagged_text = """(1) <E T="03">Act</E> means the Truth in Lending Act (15 U.S.C. 1601 <E T="03">et seq.</E>)."""
        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertEqual(results, None)
    def test_no_keyterm(self):
        node = Node('(a) Apples are grown in New Zealand.',
                    label=['101', '22', 'a'])
        node.tagged_text = '(a) Apples are grown in New Zealand.'
        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertEqual(results, None)
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within a
    single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text. @todo: streamline logic"""
    # In addition to the regex above, keyterms are an acceptable prefix. We
    # therefore convert keyterms to satisfy the above regex
    node_for_keyterms = Node(node_text, node_type=Node.INTERP,
                             label=[get_first_interp_marker(node_text)])
    node_for_keyterms.tagged_text = tagged_text
    keyterm = KeyTerms.keyterm_in_node(node_for_keyterms)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    collapsed_markers = []
    for marker in _first_markers:
        possible = [(m, m.start(), m.end())
                    for m in marker.finditer(node_text)]
        possible = remove_citation_overlaps(node_text, possible)
        possible = [triplet[0] for triplet in possible]
        collapsed_markers.extend(
            match for match in possible
            if not false_collapsed_marker(match, node_text, tagged_text)
        )
    return collapsed_markers
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars"""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    #   -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    #   Node for this paragraph
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP)
    n.tagged_text = text_with_tags
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    #   Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
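
The "- 2" offsets in this function line up because each collapsed-marker match covers the marker character plus its trailing period, so m.end() - 2 is exactly where the child paragraph's text (marker included) begins. The same slicing on a toy string, with a simple regex standing in for the marker matching:

import re

node_text = 'a. Intro text. b. First child. c. Second child.'
collapsed = [m for m in re.finditer(r'\b([a-z])\.', node_text)
             if m.start() > 0]

ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

print(repr(node_text[0:starts[0]]))  # 'a. Intro text. '
for match, end in zip(collapsed, ends):
    print(repr(node_text[match.end() - 2:end]))
# 'b. First child. '
# 'c. Second child.'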
    def test_keyterm_see(self):
        """ Keyterm tags sometimes enclose phrases such as 'See also' because
        those tags are also used for emphasis. """

        node = Node('(a) Apples. See Section 101.2', label=['101', '22', 'a'])
        node.tagged_text = '(a) <E T="03">Apples. See also</E>'

        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertEqual('Apples.', results[0]['key_term'])
    def test_keyterm_and_emphasis(self):
        node = Node('(a) Apples. Apples are grown in '
                    + 'New Zealand.', label=['101', '22', 'a'])
        node.tagged_text = ('(a) <E T="03">Apples.</E> Apples are grown in '
                            'New <E T="03">Zealand.</E>')
        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertNotEqual(results, None)
        self.assertEqual(results[0]['key_term'], 'Apples.')
        self.assertEqual(results[0]['locations'], [0])
    def test_emphasis_close_to_front(self):
        """ An emphasized word is close to the front, but is not a key term.
        """

        node = Node('(a) T et seq. has a list: apples',
                    label=['101', '22', 'a'])
        node.tagged_text = '(a) T <E T="03">et seq.</E> has a list: apples'

        kt = KeyTerms(None)
        self.assertFalse(kt.keyterm_is_first(node, 'et seq.'))
    def test_interpretation_markers(self):
        node = Node('3. et seq. has a list: apples',
                    label=['101', 'c', Node.INTERP_MARK, '3'],
                    node_type=Node.INTERP)
        node.tagged_text = '3. <E T="03">et seq.</E> has a list: apples'
        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertNotEqual(results, None)
        self.assertEqual(results[0]['key_term'], 'et seq.')
        self.assertEqual(results[0]['locations'], [0])
    def _assert_finds(self, tagged_text, *refs):
        """Compare the derived results to an expected number of references"""
        finder = def_finders.XMLTermMeans()
        text = re.sub(r"<[^>]*>", "", tagged_text)  # removes tags
        node = Node(text)
        node.tagged_text = tagged_text
        actual = finder.find(node)
        self.assertEqual(len(refs), len(actual))
        for expected, found in zip(refs, actual):
            self.assertEqual(expected.term, found.term)
            self.assertEqual(expected.start, found.start)
Example #18
    def test_node_definitions_multiple_xml(self):
        """Find xml definitions which are separated by `and`"""
        stack = ParentStack().add(0, Node(label=['9999']))
        winter = Node("(4) Cold and dreary mean winter.", label=['9999', '4'])
        winter.tagged_text = ('(4) <E T="03">Cold</E> and '
                              '<E T="03">dreary</E> mean winter.')
        inc, _ = Terms(None).node_definitions(winter, stack)
        self.assertEqual(len(inc), 2)
        cold, dreary = inc
        self.assertEqual(cold, Ref('cold', '9999-4', 4))
        self.assertEqual(dreary, Ref('dreary', '9999-4', 13))
    def assert_finds_result(self, tagged_text, parent_title, *refs):
        """Given the tags and a title for a parent node, verify that the
        provided references are found"""
        parent = Node(label=['1000', '1'], title=parent_title)
        node = Node(re.sub(r"<[^>]*>", "", tagged_text))  # removes tags
        node.tagged_text = tagged_text
        results = def_finders.DefinitionKeyterm(parent).find(node)
        self.assertEqual(len(results), len(refs))
        for expected, actual in zip(refs, results):
            self.assertEqual(expected.term, actual.term)
            self.assertEqual(expected.start, actual.start)
    def test_emphasis_later(self):
        """ Don't pick up something that is emphasized later in a paragraph as
        a key-term. """

        node = Node('(a) This has a list: apples et seq.',
                    label=['101', '22', 'a'])
        node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'

        kt = KeyTerms(None)
        results = kt.process(node)
        self.assertEqual(results, None)
Example #23
    def test_node_definitions_xml_or(self):
        """Find xml definitions which are separated by `or`"""
        stack = ParentStack().add(0, Node(label=['9999']))
        tamale = Node("(i) Hot tamale or tamale means nom nom",
                      label=['9999', '4'])
        tamale.tagged_text = ('(i) <E T="03">Hot tamale</E> or <E T="03"> '
                              'tamale</E> means nom nom ')
        inc, _ = Terms(None).node_definitions(tamale, stack)
        self.assertEqual(len(inc), 2)
        hot, tamale = inc
        self.assertEqual(hot, Ref('hot tamale', '9999-4', 4))
        self.assertEqual(tamale, Ref('tamale', '9999-4', 18))
Example #24
    def derive_nodes(self, xml, processor=None):
        nodes = []
        plain_text = ''
        for marker, plain_text, tagged_text in split_by_markers(xml):
            node = Node(text=plain_text.strip(), label=[marker],
                        source_xml=xml)
            node.tagged_text = six.text_type(tagged_text.strip())
            nodes.append(node)

        if plain_text.endswith('* * *'):    # last in loop
            nodes.append(Node(label=[mtypes.INLINE_STARS]))
        return nodes
Example #26
    def test_node_definitions_xml_commas(self):
        """Find xml definitions which have commas separating them"""
        stack = ParentStack().add(0, Node(label=['9999']))
        summer = Node("(i) Hot, humid, or dry means summer.",
                      label=['9999', '4'])
        summer.tagged_text = ('(i) <E T="03">Hot</E>, <E T="03">humid</E>, '
                              'or <E T="03">dry</E> means summer.')
        inc, _ = Terms(None).node_definitions(summer, stack)
        self.assertEqual(len(inc), 3)
        hot, humid, dry = inc
        self.assertEqual(hot, Ref('hot', '9999-4', 4))
        self.assertEqual(humid, Ref('humid', '9999-4', 9))
        self.assertEqual(dry, Ref('dry', '9999-4', 19))
Example #27
    def derive_nodes(self, xml, processor=None):
        nodes = []
        plain_text = ''
        for marker, plain_text, tagged_text in split_by_markers(xml):
            node = Node(text=plain_text.strip(),
                        label=[marker],
                        source_xml=xml)
            node.tagged_text = unicode(tagged_text.strip())
            nodes.append(node)

        if plain_text.endswith('* * *'):  # last in loop
            nodes.append(Node(label=[mtypes.INLINE_STARS]))
        return nodes
Example #28
    def derive_nodes(self, xml, processor=None):
        text = ''
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = get_markers(tagged_text, self.next_marker(xml))
        nodes = []
        for m, node_text in get_markers_and_text(xml, markers_list):
            text, tagged_text = node_text
            node = Node(text=text.strip(), label=[m], source_xml=xml)
            node.tagged_text = unicode(tagged_text.strip())
            nodes.append(node)
        if text.endswith('* * *'):
            nodes.append(Node(label=[mtypes.INLINE_STARS]))
        return nodes
Example #30
    def derive_nodes(self, xml, processor=None):
        nodes = []
        text = tree_utils.get_node_text(xml).strip()
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = self.paragraph_markers(text)
        with_parens = ['({})'.format(m) for m in markers_list]
        triplets = zip(markers_list, tree_utils.split_text(text, with_parens),
                       tree_utils.split_text(tagged_text, with_parens))
        for m, text, tagged_text in triplets:
            node = Node(text=text.strip(), label=[m], source_xml=xml)
            node.tagged_text = unicode(tagged_text.strip())
            nodes.append(node)
        return nodes
    def derive_nodes(self, xml, processor=None):
        text = tree_utils.get_node_text(xml).strip()
        node = Node(text=text, source_xml=xml)
        node.tagged_text = six.text_type(
            tree_utils.get_node_text_tags_preserved(xml).strip())

        regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
        match = regex.match(text)
        if match:
            node.label = [match.group('marker')]
        else:
            node.label = [mtypes.MARKERLESS]

        return [node]
    def derive_nodes(self, xml, processor=None):
        text = tree_utils.get_node_text(xml).strip()
        node = Node(text=text, source_xml=xml)
        node.tagged_text = unicode(
            tree_utils.get_node_text_tags_preserved(xml).strip())

        regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
        match = regex.match(text)
        if match:
            node.label = [match.group('marker')]
        else:
            node.label = [mtypes.MARKERLESS]

        return [node]
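
_PAREN_REGEX and _PERIOD_REGEX are class attributes that do not appear on this page; hypothetical equivalents make the marker dispatch in the two variants above concrete (the real patterns may differ):

import re

# Hypothetical stand-ins for the class attributes used above
_PAREN_REGEX = re.compile(r'\((?P<marker>[a-z0-9]+)\)')
_PERIOD_REGEX = re.compile(r'(?P<marker>[a-z0-9]+)\.')

def marker_of(text):
    regex = _PAREN_REGEX if text[:1] == '(' else _PERIOD_REGEX
    match = regex.match(text)
    return match.group('marker') if match else 'MARKERLESS'

print(marker_of('(a) Scope.'))       # a
print(marker_of('1. Purpose.'))      # 1
print(marker_of('No marker here.'))  # MARKERLESS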
Example #33
    def derive_nodes(self, xml, processor=None):
        nodes = []
        text = tree_utils.get_node_text(xml).strip()
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = self.paragraph_markers(text)
        with_parens = ['({})'.format(m) for m in markers_list]
        triplets = zip(markers_list,
                       tree_utils.split_text(text, with_parens),
                       tree_utils.split_text(tagged_text, with_parens))
        for m, text, tagged_text in triplets:
            node = Node(text=text.strip(), label=[m], source_xml=xml)
            node.tagged_text = six.text_type(tagged_text.strip())
            nodes.append(node)
        return nodes
    def test_node_definitions_multiple_xml(self):
        t = Terms(None)
        stack = ParentStack()
        stack.add(0, Node(label=['9999']))

        winter = Node("(4) Cold and dreary mean winter.", label=['9999', '4'])
        tagged = '(4) <E T="03">Cold</E> and <E T="03">dreary</E> mean '
        tagged += 'winter.'
        winter.tagged_text = tagged
        inc, _ = t.node_definitions(winter, stack)
        self.assertEqual(len(inc), 2)
        cold, dreary = inc
        self.assertEqual(cold, Ref('cold', '9999-4', (4, 8)))
        self.assertEqual(dreary, Ref('dreary', '9999-4', (13, 19)))

        summer = Node("(i) Hot, humid, or dry means summer.",
                      label=['9999', '4'])
        tagged = '(i) <E T="03">Hot</E>, <E T="03">humid</E>, or '
        tagged += '<E T="03">dry</E> means summer.'
        summer.tagged_text = tagged
        inc, _ = t.node_definitions(summer, stack)
        self.assertEqual(len(inc), 3)
        hot, humid, dry = inc
        self.assertEqual(hot, Ref('hot', '9999-4', (4, 7)))
        self.assertEqual(humid, Ref('humid', '9999-4', (9, 14)))
        self.assertEqual(dry, Ref('dry', '9999-4', (19, 22)))

        tamale = Node("(i) Hot tamale or tamale means nom nom",
                      label=['9999', '4'])
        tagged = '(i) <E T="03">Hot tamale</E> or <E T="03"> tamale</E> '
        tagged += 'means nom nom '
        tamale.tagged_text = tagged
        inc, _ = t.node_definitions(tamale, stack)
        self.assertEqual(len(inc), 2)
        hot, tamale = inc
        self.assertEqual(hot, Ref('hot tamale', '9999-4', (4, 14)))
        self.assertEqual(tamale, Ref('tamale', '9999-4', (18, 24)))
Example #36
def build_from_section(reg_part, section_xml):
    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = (subject_xml[0].text or '').strip()

    section_nums = []
    for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
        secnum_candidate = match.group(1)
        if secnum_candidate.isdigit():
            secnum_candidate = int(secnum_candidate)
        section_nums.append(secnum_candidate)

    #  Merge spans longer than 3 sections
    section_span_end = None
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        if last - first + 1 > 3:
            section_span_end = str(last)
            section_nums = [first]
        else:
            section_nums = []
            for i in range(first, last + 1):
                section_nums.append(i)

    section_nodes = []
    for section_number in section_nums:
        section_number = str(section_number)
        section_text = (section_xml.text or '').strip()
        tagged_section_text = section_xml.text

        if section_span_end:
            section_title = u"§§ {}.{}-{}".format(
                reg_part, section_number, section_span_end)
        else:
            section_title = u"§ {}.{}".format(reg_part, section_number)
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        section_nodes.append(
            RegtextParagraphProcessor().process(section_xml, sect_node)
        )
    return section_nodes
Example #37
def dict_to_node(node_dict):
    """ Convert a dictionary representation of a node into a Node object if
    it contains the minimum required fields. Otherwise, pass it through
    unchanged. """
    minimum_fields = set(('text', 'label', 'node_type'))
    if minimum_fields.issubset(node_dict.keys()):
        node = Node(node_dict['text'], [], node_dict['label'],
                    node_dict.get('title', None), node_dict['node_type'])
        if 'tagged_text' in node_dict:
            node.tagged_text = node_dict['tagged_text']
        if 'child_labels' in node_dict:
            node.child_labels = node_dict['child_labels']
        return node
    else:
        return node_dict
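
Expected behaviour, mirroring the test_dict_to_node test earlier on this page: a dict with the minimum fields becomes a Node, anything else passes through unchanged.

dict_node = {'text': 'node text', 'label': ['205', 'A'],
             'node_type': 'appendix'}
dict_to_node(dict_node)
# -> Node('node text', [], ['205', 'A'], None, 'appendix')
dict_to_node({'text': 'node text'})
# -> {'text': 'node text'}  (passed through unchanged)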
Example #39
    def paragraph_with_marker(self, text, tagged_text):
        """The paragraph has a marker, like (a) or a. etc."""
        # To aid in determining collapsed paragraphs, replace any
        # keyterms present
        node_for_keyterms = Node(text, node_type=Node.APPENDIX)
        node_for_keyterms.tagged_text = tagged_text
        node_for_keyterms.label = [initial_marker(text)[0]]
        keyterm = KeyTerms.get_keyterm(node_for_keyterms)
        if keyterm:
            mtext = text.replace(keyterm, '.'*len(keyterm))
        else:
            mtext = text

        for mtext in split_paragraph_text(mtext):
            if keyterm:     # still need the original text
                mtext = mtext.replace('.'*len(keyterm), keyterm)
            node = Node(mtext, node_type=Node.APPENDIX,
                        label=[initial_marker(mtext)[0]])
            self.nodes.append(node)
Example #40
def build_from_section(reg_part, section_xml):
    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = (subject_xml[0].text or '').strip()

    section_nums = []
    for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
        secnum_candidate = match.group(1)
        if secnum_candidate.isdigit():
            secnum_candidate = int(secnum_candidate)
        section_nums.append(secnum_candidate)

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    section_nodes = []
    for section_number in section_nums:
        section_number = str(section_number)
        section_text = (section_xml.text or '').strip()
        tagged_section_text = section_xml.text

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        section_nodes.append(
            RegtextParagraphProcessor().process(section_xml, sect_node)
        )
    return section_nodes
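
The section-number handling above is pure string work and can be exercised on its own: a u'§§' prefix plus a dash marks a span, which expands into one section number (and ultimately one node) per section:

import re

reg_part = '205'
section_no = u'§§ 205.1-205.4'
section_nums = [int(m.group(1))
                for m in re.finditer(r'%s\.(\d+)' % reg_part, section_no)]
if section_no[:2] == u'§§' and '-' in section_no:
    first, last = section_nums
    section_nums = list(range(first, last + 1))
print(section_nums)  # [1, 2, 3, 4]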
Example #42
def child_with_marker(child_node, stack):
    """Machinery to build a node for an interp's inner child. Assumes the
    paragraph begins with a paragraph marker."""
    node_text = tree_utils.get_node_text(child_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(child_node)
    first_marker = get_first_interp_marker(text_with_tags)

    collapsed = collapsed_markers_matches(node_text)

    #   -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    #   Node for this paragraph
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP)
    n.tagged_text = text_with_tags
    last = stack.peek()

    if len(last) == 0:
        stack.push_last((interpretation_level(first_marker), n))
    else:
        node_level = interpretation_level(first_marker, last[0][0])
        if node_level is None:
            logging.warning("Couldn't determine node_level for this "
                            + "interpretation paragraph: " + n.text)
            node_level = last[0][0] + 1
        stack.add(node_level, n)

    #   Collapsed-marker children
    for match, end in zip(collapsed, ends):
        n = Node(node_text[match.end() - 2:end], label=[match.group(1)],
                 node_type=Node.INTERP)
        node_level = interpretation_level(match.group(1))
        last = stack.peek()
        if len(last) == 0:
            stack.push_last((node_level, n))
        else:
            stack.add(node_level, n)
Example #44
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [
        ch for ch in section_xml.getchildren() if ch.tag in ['P', 'STARS']
    ]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # Is this a group of definitions without numbers next to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    #nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))

        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)

            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths([n.label[0] for n in nodes], [
            rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman
            ])
        ])

    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))

    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text,
                         label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
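
The ad-hoc definition-marker heuristic above (still flagged with a TODO for a proper grammar) reduces to: take the words before 'means' (or 'shall'), and CamelCase them into a synthetic marker. In isolation:

def definition_marker(text):
    """CamelCase the words preceding 'means' into a synthetic marker."""
    words = text.split('means')[0].strip().split()
    return ''.join(word[0].upper() + word[1:] for word in words)

print(definition_marker('Billing error means an error described below.'))
# BillingError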
Example #45
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search(r'[0-9]+\.[0-9]+', xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'

        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")

            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)

        elif not first_marker and not manual_hierarchy:
            logging.warning(
                "Couldn't determine interp marker. Appending to "
                "previous paragraph: %s", node_text)

            if nodes:
                previous = nodes[-1]
            else:
                previous = parent

            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags

        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]],
                     label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end],
                         label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths([node.label[0] for node in nodes], [
            rules.depth_type_order(
                [(mtypes.ints, mtypes.em_ints),
                 (mtypes.lower, mtypes.roman, mtypes.upper), mtypes.upper,
                 mtypes.em_ints, mtypes.em_roman])
        ])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')

    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
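
One detail shared by all three branches above: collapsed '1.' markers were stored wrapped in their emphasis tag (marker = '<E T="03">1</E>' earlier in this function), so every label is stripped of that tag before depth assignment. The cleanup in isolation:

label = ['<E T="03">1</E>']
clean = [l.replace('<E T="03">', '').replace('</E>', '') for l in label]
print(clean)  # ['1']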
Example #47
    def derive_nodes(self, xml, processor=None):
        node = Node(table_xml_to_plaintext(xml),
                    label=[mtypes.MARKERLESS],
                    source_xml=xml)
        node.tagged_text = etree.tounicode(xml).strip()
        return [node]
Example #48
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment that violates the fewest of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
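
A self-contained sketch (a toy regex, not the parser's real collapsed-marker matcher) of the -2 offset arithmetic above: each match consumes the marker character plus its period, so m.end() - 2 backs up to where the collapsed child's text begins.

import re

node_text = 'First thought here. i. A collapsed child paragraph.'
collapsed = list(re.finditer(r' (i)\.', node_text))

# Mirrors the starts/ends construction in the function above
starts = [m.end() - 2 for m in collapsed] + [len(node_text)]
ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]

print(node_text[:starts[0]])         # text kept for the parent paragraph
print(node_text[starts[0]:ends[0]])  # text for the collapsed "i" child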
Ejemplo n.º 49
0
    def test_node_definitions(self):
        t = Terms(None)
        smart_quotes = [
            (u'This has a “worD” and then more',
             [Ref('word', 'aaa', (12, 16))]),
            (u'I have “anotheR word” term and “moree”',
             [Ref('another word', 'bbb', (8, 20)),
              Ref('moree', 'bbb', (32, 37))]),
            (u'But the child “DoeS sEe”?',
             [Ref('does see', 'ccc', (15, 23))]),
            (u'Start with “this,”', [Ref('this', 'hhh', (12, 16))]),
            (u'Start with “this;”', [Ref('this', 'iii', (12, 16))]),
            (u'Start with “this.”', [Ref('this', 'jjj', (12, 16))]),
            (u'As do “subchildren”',
             [Ref('subchildren', 'ddd', (7, 18))])]

        no_defs = [
            u'This has no defs',
            u'Also has no terms',
            u'Still no terms, but',
            u'the next one does']

        xml_defs = [
            (u'(4) Thing means a thing that is defined',
             u'(4) <E T="03">Thing</E> means a thing that is defined',
             Ref('thing', 'eee', (4, 9))),
            (u'(e) Well-meaning lawyers means people who do weird things',
             u'(e) <E T="03">Well-meaning lawyers</E> means people who do '
             + 'weird things',
             Ref('well-meaning lawyers', 'fff', (4, 24))),
            (u'(e) Words have the same meaning as in a dictionary',
             u'(e) <E T="03">Words</E> have the same meaning as in a '
             + 'dictionary',
             Ref('words', 'ffg', (4, 9))),
            (u'(e) Banana has the same meaning as bonono',
             u'(e) <E T="03">Banana</E> has the same meaning as bonono',
             Ref('banana', 'fgf', (4, 10))),
            (u'(f) Huge billowy clouds means I want to take a nap',
             u'(f) <E T="03">Huge billowy clouds</E> means I want to take a '
             + 'nap',
             Ref('huge billowy clouds', 'ggg', (4, 23)))]

        xml_no_defs = [
            (u'(d) Term1 or term2 means stuff',
             u'(d) <E T="03">Term1</E> or <E T="03">term2</E> means stuff'),
            (u'This term means should not match',
             u'<E T="03">This term</E> means should not match')]

        scope_term_defs = [
            ('For purposes of this section, the term blue means the color',
             Ref('blue', '11-11', (39, 43))),
            ('For purposes of paragraph (a)(1) of this section, the term '
             + 'cool bro means hip cat', Ref('cool bro', '11-22', (59, 67))),
            ('For purposes of this paragraph, po jo means "poor Joe"',
             Ref('po jo', '11-33', (32, 37)))]

        stack = ParentStack()
        stack.add(0, Node(label=['999']))
        for txt in no_defs:
            defs, exc = t.node_definitions(Node(txt), stack)
            self.assertEqual([], defs)
            self.assertEqual([], exc)
        for txt, refs in smart_quotes:
            defs, exc = t.node_definitions(Node(txt), stack)
            self.assertEqual([], defs)
            self.assertEqual([], exc)
        for txt, xml in xml_no_defs:
            node = Node(txt)
            node.tagged_text = xml
            defs, exc = t.node_definitions(node, stack)
            self.assertEqual([], defs)
            self.assertEqual([], exc)
        for txt, xml, ref in xml_defs:
            node = Node(txt, label=[ref.label])
            node.tagged_text = xml
            defs, exc = t.node_definitions(node, stack)
            self.assertEqual([ref], defs)
            self.assertEqual([], exc)
        for txt, ref in scope_term_defs:
            defs, exc = t.node_definitions(
                Node(txt, label=ref.label.split('-')), stack)
            self.assertEqual([ref], defs)
            self.assertEqual([], exc)

        #   smart quotes are affected by the parent
        stack.add(1, Node('Definitions', label=['999', '1']))
        for txt in no_defs:
            defs, exc = t.node_definitions(Node(txt), stack)
            self.assertEqual([], defs)
            self.assertEqual([], exc)
        for txt, refs in smart_quotes:
            defs, exc = t.node_definitions(Node(txt, label=[refs[0].label]),
                                           stack)
            self.assertEqual(refs, defs)
            self.assertEqual([], exc)
        for txt, xml in xml_no_defs:
            node = Node(txt)
            node.tagged_text = xml
            defs, exc = t.node_definitions(node, stack)
            self.assertEqual([], defs)
            self.assertEqual([], exc)
        for txt, xml, ref in xml_defs:
            node = Node(txt, label=[ref.label])
            node.tagged_text = xml
            defs, exc = t.node_definitions(node, stack)
            self.assertEqual([ref], defs)
            self.assertEqual([], exc)
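
The span arithmetic in the smart_quotes fixtures can be reproduced with a toy regex (a sketch, not Terms.node_definitions itself): each Ref's offsets point at the quoted term, lower-cased.

import re

text = u'I have “anotheR word” term and “moree”'
terms = [(m.group(1).lower(), m.span(1))
         for m in re.finditer(u'“([^”]+)”', text)]
print(terms)  # [('another word', (8, 20)), ('moree', (32, 37))]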
Ejemplo n.º 50
0
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment that violates the fewest of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
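
Finally, a toy illustration (the Assignment class is hypothetical, standing in for derive_depths solutions) of the selection step both copies of the function share: after the heuristics adjust weights, the highest-weight assignment wins.

class Assignment(object):
    """Hypothetical stand-in for one derive_depths solution."""
    def __init__(self, name, weight):
        self.name, self.weight = name, weight

candidates = [Assignment('flat', 0.4), Assignment('nested', 0.9)]
best = sorted(candidates, key=lambda d: d.weight, reverse=True)[0]
print(best.name)  # 'nested' -- the assignment violating the fewest heuristics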