class ScopeFinderTest(TestCase):
    def setUp(self):
        self.finder = ScopeFinder()
        self.stack = ParentStack()

    def add_nodes(self, length):
        """There's a common prefix of nodes we'll add"""
        label = ['1000', '3', 'd', '6', 'iii']
        for i in range(length):
            self.stack.add(i, Node(label=label[:i + 1]))

    def assert_scope(self, *scopes):
        self.assertEqual(list(scopes),
                         self.finder.determine_scope(self.stack))

    def test_determine_scope_default(self):
        """Defaults to the entire reg"""
        self.add_nodes(2)
        self.assert_scope(('1000',))

    def test_determine_scope_this_part(self):
        """Definitions scoped to a part also cover the interpretations for
        that part"""
        self.add_nodes(1)
        self.stack.add(1, Node('For the purposes of this part, blah blah',
                               label=['1001', '3']))
        self.assert_scope(('1001',), ('1001', Node.INTERP_MARK))

    def test_determine_scope_this_subpart(self):
        """Subpart scope gets expanded to include other sections in the
        same subpart"""
        self.finder.subpart_map = {'SubPart 1': ['A', '3'],
                                   'Other': []}
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this subpart, yada yada',
                               label=['1000', '3', 'c']))
        self.assert_scope(
            ('1000', 'A'), ('1000', '3'),
            ('1000', 'A', Node.INTERP_MARK),
            ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_this_section(self):
        """Section scope can be triggered in a child paragraph"""
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this section, blah blah',
                               label=['1000', '3', 'd']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_this_paragraph(self):
        """Paragraph scope is tied to the paragraph that determined it.
        Previous paragraph scopes won't apply to adjacent children"""
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this section, blah blah',
                               label=['1000', '3', 'd']))
        self.stack.add(3, Node('For the purposes of this paragraph, blah '
                               'blah', label=['1000', '3', 'd', '5']))
        self.assert_scope(('1000', '3', 'd', '5'),
                          ('1000', '3', 'd', '5', Node.INTERP_MARK))
        self.stack.add(3, Node(label=['1002', '3', 'd', '6']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))
        self.stack.add(3, Node('Blah as used in this paragraph, blah blah',
                               label=['1000', '3', 'd', '7']))
        self.assert_scope(('1000', '3', 'd', '7'),
                          ('1000', '3', 'd', '7', Node.INTERP_MARK))

    def test_determine_scope_purposes_of_specific_paragraph(self):
        self.add_nodes(4)
        self.stack.add(
            4, Node(u'For the purposes of this § 1000.3(d)(6)(i), blah',
                    label=['1000', '3', 'd', '6', 'i']))
        self.assert_scope(('1000', '3', 'd', '6', 'i'),
                          ('1000', '3', 'd', '6', 'i', Node.INTERP_MARK))

    def test_determine_scope_purposes_of_specific_section(self):
        self.add_nodes(4)
        self.stack.add(4, Node(u'For the purposes of § 1000.3, blah',
                               label=['1000', '3', 'd', '6', 'ii']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_as_used_in_this_section(self):
        self.add_nodes(4)
        self.stack.add(4, Node('As used in this section, blah blah',
                               label=['1000', '3', 'd', '6', 'iii']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_subpart_scope(self):
        self.finder.subpart_map = {
            None: ['1', '2', '3'],
            'A': ['7', '5', '0'],
            'Q': ['99', 'abc', 'q']
        }
        self.assertEqual([['111', '1'], ['111', '2'], ['111', '3']],
                         self.finder.subpart_scope(['111', '3']))
        self.assertEqual([['115', '7'], ['115', '5'], ['115', '0']],
                         self.finder.subpart_scope(['115', '5']))
        self.assertEqual([['62', '99'], ['62', 'abc'], ['62', 'q']],
                         self.finder.subpart_scope(['62', 'abc']))
        self.assertEqual([], self.finder.subpart_scope(['71', 'Z']))
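
# A minimal usage sketch (not part of the original suite): determine_scope
# reads the stack of ancestor nodes, so a "this section" trigger phrase
# expands to the section scope plus its interpretations, as the tests
# above exercise.
def _demo_scope_finder():
    finder = ScopeFinder()
    stack = ParentStack()
    stack.add(0, Node(label=['1000']))
    stack.add(1, Node('For the purposes of this section, blah',
                      label=['1000', '3']))
    # Expected (per the tests above):
    # [('1000', '3'), ('1000', '3', Node.INTERP_MARK)]
    return finder.determine_scope(stack)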
class Terms(Layer):
    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Add
        these definitions to self.scoped_terms. Also keep track of which
        subpart we are in. Finally, document all defined terms."""
        self.scope_finder.add_subparts(self.tree)
        stack = ParentStack()

        def per_node(node):
            stack.add(node.depth(), node)
            if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                                  struct.Node.EMPTYPART):
                included, excluded = self.node_definitions(node, stack)
                if included:
                    for scope in self.scope_finder.determine_scope(stack):
                        self.scoped_terms[scope].extend(included)
                self.scoped_terms['EXCLUDED'].extend(excluded)
        struct.walk(self.tree, per_node)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced       # New term
                        # Or this term is earlier in the paragraph
                        or ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope
        simply applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as
        they would replace previous (correct) definitions."""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            return bool(re.search(regex, node.text.lower()))
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return (
            [r for r in references if not self.is_exclusion(r.term, node)],
            [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [
            (term, ref) for term, ref in applicable_terms.iteritems()
            if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node.label_id(), node.text)
        exclusions = self.per_regulation_ignores(
            exclusions, node.label, node.text)

        inclusions = self.included_offsets(node.label_id(), node.text)
        inclusions = self.per_regulation_includes(
            inclusions, node.label, node.text)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        for term, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        return [(match.start(), match.end()) for match in
                re.finditer(r'\b' + re.escape(term) + r'\b', text)]

    def per_regulation_ignores(self, exclusions, label, text):
        cfr_part = label[0]
        if settings.IGNORE_DEFINITIONS_IN.get(cfr_part):
            for ignore_term in settings.IGNORE_DEFINITIONS_IN[cfr_part]:
                exclusions.extend(self._word_matches(ignore_term, text))
        return exclusions

    def excluded_offsets(self, label, text):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(
                ref.position for ref in reflist if ref.label == label)
        for ignore_term in settings.IGNORE_DEFINITIONS_IN['ALL']:
            exclusions.extend(self._word_matches(ignore_term, text))
        return exclusions

    def per_regulation_includes(self, inclusions, label, text):
        cfr_part = label[0]
        if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
            part_includes = settings.INCLUDE_DEFINITIONS_IN[cfr_part]
            for included_term, context in part_includes:
                inclusions.extend(self._word_matches(included_term, text))
        return inclusions

    def included_offsets(self, label, text):
        """We explicitly include certain chunks of text (for example, words
        that the parser doesn't necessarily pick up as being defined) that
        should be part of a defined term"""
        inclusions = []
        for included_term, context in settings.INCLUDE_DEFINITIONS_IN['ALL']:
            inclusions.extend(self._word_matches(included_term, text))
        return inclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=[],
                          inclusions=[]):
        """Search for defined terms in this text, with a preference for
        all larger (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions)
        inclusions = list(inclusions)

        # add plurals to applicable terms
        pluralized = [(inflection.pluralize(t[0]), t[1])
                      for t in applicable_terms]
        applicable_terms += pluralized

        # longer terms first
        applicable_terms.sort(key=lambda x: len(x[0]), reverse=True)

        matches = []
        for term, ref in applicable_terms:
            re_term = ur'\b' + re.escape(term) + ur'\b'
            offsets = [
                (m.start(), m.end())
                for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                # Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                # End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
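
# Hypothetical end-to-end sketch (not in the source): the layer runs in two
# passes. pre_process() walks the whole tree once, collecting definitions
# into self.scoped_terms; process(node) is then called per node to emit
# term offsets. `reg_tree` is an assumed struct.Node regulation tree.
def _demo_terms_layer(reg_tree):
    layer = Terms(reg_tree)
    layer.pre_process()                  # pass 1: find and scope definitions

    results = {}

    def collect(node):
        found = layer.process(node)      # pass 2: offsets for this node
        if found:
            results[node.label_id()] = found

    struct.walk(reg_tree, collect)
    return results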
# Fixture fragment for tests exercising the def_finders.ScopeMatch finder
def setUp(self):
    self.finder = def_finders.ScopeMatch(ScopeFinder())
class Terms(Layer):
    shorthand = 'terms'
    STARTS_WITH_WORDCHAR = re.compile('^\w.*$')
    ENDS_WITH_WORDCHAR = re.compile('^.*\w$')

    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()

    def look_for_defs(self, node, stack=None):
        """Check a node and recursively check its children for terms which
        are being defined. Add these definitions to self.scoped_terms."""
        stack = stack or ParentStack()
        stack.add(node.depth(), node)
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.scope_finder.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)
        for child in node.children:
            self.look_for_defs(child, stack)

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Also
        keep track of which subpart we are in. Finally, document all
        defined terms."""
        self.scope_finder.add_subparts(self.tree)
        self.look_for_defs(self.tree)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced or    # New term
                        # Or this term is earlier in the paragraph
                        ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope
        simply applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as
        they would replace previous (correct) definitions. We also remove
        terms which are inside an instance of the IGNORE_DEFINITIONS_IN
        setting"""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            if re.search(regex, node.text.lower()):
                return True
        for start, end in self.ignored_offsets(node.label[0], node.text):
            if term in node.text[start:end]:
                return True
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return ([r for r in references
                 if not self.is_exclusion(r.term, node)],
                [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [(term, ref)
                     for term, ref in applicable_terms.iteritems()
                     if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        matches = sorted(matches, key=lambda (term, r, o): term)
        for term, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        # @todo - this is rather slow -- probably want to memoize the results
        regex = re.escape(term)
        if self.STARTS_WITH_WORDCHAR.match(term):
            regex = r'\b' + regex
        if self.ENDS_WITH_WORDCHAR.match(term):
            regex += r'\b'
        regex = re.compile(regex)
        return [(match.start(), match.end())
                for match in regex.finditer(text)]

    def ignored_offsets(self, cfr_part, text):
        """Return a list of offsets corresponding to the presence of an
        "ignored" phrase in the text"""
        ignored_phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
                           settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
        positions = []
        for phrase in ignored_phrases:
            positions.extend(self._word_matches(phrase, text))
        return positions

    def excluded_offsets(self, node):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(ref.position for ref in reflist
                              if ref.label == node.label_id())
        exclusions.extend(self.ignored_offsets(node.label[0], node.text))
        return exclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=[],
                          inclusions=[]):
        """Search for defined terms in this text, including singular and
        plural forms of these terms, with a preference for all larger
        (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions)
        inclusions = list(inclusions)

        # add singulars and plurals to search terms
        search_terms = set((inflection.singularize(t[0]), t[1])
                           for t in applicable_terms)
        search_terms |= set((inflection.pluralize(t[0]), t[1])
                            for t in applicable_terms)

        # longer terms first
        search_terms = sorted(search_terms, key=lambda x: len(x[0]),
                              reverse=True)

        matches = []
        for term, ref in search_terms:
            re_term = ur'\b' + re.escape(term) + ur'\b'
            offsets = [(m.start(), m.end())
                       for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                # Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                # End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
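
# Sketch of the word-boundary behavior (invented inputs, for illustration):
# _word_matches only anchors \b where the phrase starts or ends with a word
# character, so phrases wrapped in punctuation still match. Constructing
# Terms(None) is assumed safe here since _word_matches never reads the tree.
def _demo_word_matches():
    layer = Terms(None)
    text = 'state law, and State Law elsewhere'
    # Case-sensitive search over the raw text: only the first phrase hits.
    return layer._word_matches('state law', text)    # -> [(0, 9)]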
class Terms(Layer):
    shorthand = 'terms'
    STARTS_WITH_WORDCHAR = re.compile(r'^\w.*$')
    ENDS_WITH_WORDCHAR = re.compile(r'^.*\w$')

    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()
        self._inflected = {}

    def inflected(self, term):
        """Check the memoized Inflected version of the provided term"""
        if term not in self._inflected:
            self._inflected[term] = Inflected(
                inflection.singularize(term), inflection.pluralize(term))
        return self._inflected[term]

    def look_for_defs(self, node, stack=None):
        """Check a node and recursively check its children for terms which
        are being defined. Add these definitions to self.scoped_terms."""
        stack = stack or ParentStack()
        stack.add(node.depth(), node)
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.scope_finder.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)
        for child in node.children:
            self.look_for_defs(child, stack)

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Also
        keep track of which subpart we are in. Finally, document all
        defined terms."""
        self.scope_finder.add_subparts(self.tree)
        self.look_for_defs(self.tree)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced or    # New term
                        # Or this term is earlier in the paragraph
                        ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope
        simply applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as
        they would replace previous (correct) definitions. We also remove
        terms which are inside an instance of the IGNORE_DEFINITIONS_IN
        setting"""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            if re.search(regex, node.text.lower()):
                return True
        for start, end in self.ignored_offsets(node.label[0], node.text):
            if term in node.text[start:end]:
                return True
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return (
            [r for r in references if not self.is_exclusion(r.term, node)],
            [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [
            (term, ref) for term, ref in applicable_terms.items()
            if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        matches = sorted(matches, key=lambda triplet: triplet[0])
        for _, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        # @todo - this is rather slow -- probably want to memoize the results
        regex = re.escape(term)
        if self.STARTS_WITH_WORDCHAR.match(term):
            regex = r'\b' + regex
        if self.ENDS_WITH_WORDCHAR.match(term):
            regex += r'\b'
        regex = re.compile(regex)
        return [(match.start(), match.end())
                for match in regex.finditer(text)]

    def ignored_offsets(self, cfr_part, text):
        """Return a list of offsets corresponding to the presence of an
        "ignored" phrase in the text"""
        ignored_phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
                           settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
        positions = []
        for phrase in ignored_phrases:
            positions.extend(self._word_matches(phrase, text))
        return positions

    def excluded_offsets(self, node):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(
                ref.position for ref in reflist
                if ref.label == node.label_id())
        exclusions.extend(self.ignored_offsets(node.label[0], node.text))
        return exclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=None,
                          inclusions=None):
        """Search for defined terms in this text, including singular and
        plural forms of these terms, with a preference for all larger
        (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions or [])
        inclusions = list(inclusions or [])

        # add singulars and plurals to search terms
        search_terms = {(inflected, t[1]) for t in applicable_terms
                        for inflected in self.inflected(t[0])}

        # longer terms first
        search_terms = sorted(search_terms, key=lambda x: len(x[0]),
                              reverse=True)

        matches = []
        for term, ref in search_terms:
            re_term = r'\b' + re.escape(term) + r'\b'
            offsets = [
                (m.start(), m.end())
                for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                # Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                # End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
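
# Sketch of the memoized inflection helper (not in the source). Inflected
# is assumed to be a namedtuple('Inflected', ('singular', 'plural')), so
# iterating it in calculate_offsets yields both forms; repeated lookups
# reuse the cached tuple instead of re-running inflection. Terms(None) is
# assumed safe since inflected() never reads the tree.
def _demo_inflected():
    layer = Terms(None)
    first = layer.inflected('mortgage brokers')
    second = layer.inflected('mortgage brokers')
    assert first is second           # memoized: same object both times
    # first.singular == 'mortgage broker'
    # first.plural == 'mortgage brokers'
    return first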