Example #1
    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()
        self._inflected = {}
Example #2
    def setUp(self):
        self.finder = ScopeFinder()
        self.stack = ParentStack()
Example #3
class ScopeFinderTest(TestCase):
    def setUp(self):
        self.finder = ScopeFinder()
        self.stack = ParentStack()

    def add_nodes(self, length):
        """There's a common prefix of nodes we'll add"""
        label = ['1000', '3', 'd', '6', 'iii']
        for i in range(length):
            self.stack.add(i, Node(label=label[:i+1]))

    def assert_scope(self, *scopes):
        self.assertEqual(list(scopes), self.finder.determine_scope(self.stack))

    def test_determine_scope_default(self):
        """Defaults to the entire reg"""
        self.add_nodes(2)
        self.assert_scope(('1000',))

    def test_determine_scope_this_part(self):
        """Definitions scoped to a part also cover the interpretations for
        that part"""
        self.add_nodes(1)
        self.stack.add(1, Node('For the purposes of this part, blah blah',
                               label=['1001', '3']))
        self.assert_scope(('1001',), ('1001', Node.INTERP_MARK))

    def test_determine_scope_this_subpart(self):
        """Subpart scope gets expanded to include other sections in the same
        subpart"""
        self.finder.subpart_map = {
            'SubPart 1': ['A', '3'],
            'Other': []
        }
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this subpart, yada yada',
                               label=['1000', '3', 'c']))
        self.assert_scope(('1000', 'A'), ('1000', '3'),
                          ('1000', 'A', Node.INTERP_MARK),
                          ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_this_section(self):
        """Section scope can be triggered in a child paragraph"""
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this section, blah blah',
                               label=['1000', '3', 'd']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_this_paragraph(self):
        """Paragraph scope is tied to the paragraph that determined it.
        Previous paragraph scopes won't apply to adjacent children"""
        self.add_nodes(2)
        self.stack.add(2, Node('For the purposes of this section, blah blah',
                               label=['1000', '3', 'd']))
        self.stack.add(3, Node('For the purposes of this paragraph, blah blah',
                               label=['1000', '3', 'd', '5']))
        self.assert_scope(('1000', '3', 'd', '5'),
                          ('1000', '3', 'd', '5', Node.INTERP_MARK))

        self.stack.add(3, Node(label=['1002', '3', 'd', '6']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

        self.stack.add(3, Node('Blah as used in this paragraph, blah blah',
                               label=['1000', '3', 'd', '7']))
        self.assert_scope(('1000', '3', 'd', '7'),
                          ('1000', '3', 'd', '7', Node.INTERP_MARK))

    def test_determine_scope_purposes_of_specific_paragraph(self):
        self.add_nodes(4)
        self.stack.add(
            4, Node(u'For the purposes of this § 1000.3(d)(6)(i), blah',
                    label=['1000', '3', 'd', '6', 'i']))
        self.assert_scope(('1000', '3', 'd', '6', 'i'),
                          ('1000', '3', 'd', '6', 'i', Node.INTERP_MARK))

    def test_determine_scope_purposes_of_specific_section(self):
        self.add_nodes(4)
        self.stack.add(4, Node(u'For the purposes of § 1000.3, blah',
                               label=['1000', '3', 'd', '6', 'ii']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_determine_scope_as_used_in_this_section(self):
        self.add_nodes(4)
        self.stack.add(4, Node('As used in this section, blah blah',
                               label=['1000', '3', 'd', '6', 'iii']))
        self.assert_scope(('1000', '3'), ('1000', '3', Node.INTERP_MARK))

    def test_subpart_scope(self):
        self.finder.subpart_map = {
            None: ['1', '2', '3'],
            'A': ['7', '5', '0'],
            'Q': ['99', 'abc', 'q']
        }
        self.assertEqual([['111', '1'], ['111', '2'], ['111', '3']],
                         self.finder.subpart_scope(['111', '3']))
        self.assertEqual([['115', '7'], ['115', '5'], ['115', '0']],
                         self.finder.subpart_scope(['115', '5']))
        self.assertEqual([['62', '99'], ['62', 'abc'], ['62', 'q']],
                         self.finder.subpart_scope(['62', 'abc']))
        self.assertEqual([], self.finder.subpart_scope(['71', 'Z']))
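
The tests above pin down subpart_scope's contract; a minimal sketch consistent
with them (illustrative only, assuming subpart_map maps a subpart identifier
to its list of section strings):

def subpart_scope(self, label_parts):
    """Expand [part, section] into [[part, sect], ...] for every
    section in the same subpart, or [] if the section is unmapped."""
    part, section = label_parts
    for sections in self.subpart_map.values():
        if section in sections:
            return [[part, sect] for sect in sections]
    return []
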
Example #4
class Terms(Layer):
    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Add
        these definitions to self.scoped_terms. Also keep track of which
        subpart we are in. Finally, document all defined terms. """
        self.scope_finder.add_subparts(self.tree)
        stack = ParentStack()

        def per_node(node):
            stack.add(node.depth(), node)
            if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                                  struct.Node.EMPTYPART):
                included, excluded = self.node_definitions(node, stack)
                if included:
                    for scope in self.scope_finder.determine_scope(stack):
                        self.scoped_terms[scope].extend(included)
                self.scoped_terms['EXCLUDED'].extend(excluded)

        struct.walk(self.tree, per_node)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced     # New term
                        # Or this term is earlier in the paragraph
                        or ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }
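
    # Illustration of the dedup above: a Ref(term='bank',
    # label='1000-3-d') files under the key 'bank:1000-3-d' (label
    # format illustrative); if the same key recurs at a later offset in
    # the paragraph, the earlier position wins.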

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope simply
        applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms
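
    # Illustration: scopes are label prefixes, so a node's label is
    # looked up under each prefix in turn:
    #     >>> label = ['1000', '3', 'd']
    #     >>> [tuple(label[:n]) for n in range(1, len(label) + 1)]
    #     [('1000',), ('1000', '3'), ('1000', '3', 'd')]
    # Part-wide and section-wide definitions both apply, with deeper
    # (more specific) scopes overwriting shallower ones.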

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as they
        would replace previous (correct) definitions."""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            return bool(re.search(regex, node.text.lower()))
        return False
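
    # e.g. "The term 'bank' does not include a credit union" lowercases
    # to match the pattern (each .? absorbs a quote character), so the
    # competing definition is filed as an exclusion rather than
    # overwriting the real one.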

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return (
            [r for r in references if not self.is_exclusion(r.term, node)],
            [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [
            (term, ref) for term, ref in applicable_terms.iteritems()
            if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node.label_id(), node.text)
        exclusions = self.per_regulation_ignores(
            exclusions, node.label, node.text)

        inclusions = self.included_offsets(node.label_id(), node.text)
        inclusions = self.per_regulation_includes(
            inclusions, node.label, node.text)

        matches = self.calculate_offsets(
            node.text, term_list, exclusions, inclusions)
        for term, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
                })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        return [(match.start(), match.end()) for match in
                re.finditer(r'\b' + re.escape(term) + r'\b', text)]
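
    # For example, _word_matches('act', 'the act and a transaction')
    # returns [(4, 7)]: the standalone 'act' matches, while the 'act'
    # buried inside 'transaction' fails the \b word-boundary test.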

    def per_regulation_ignores(self, exclusions, label, text):
        cfr_part = label[0]
        if settings.IGNORE_DEFINITIONS_IN.get(cfr_part):
            for ignore_term in settings.IGNORE_DEFINITIONS_IN[cfr_part]:
                exclusions.extend(self._word_matches(ignore_term, text))
        return exclusions

    def excluded_offsets(self, label, text):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(
                ref.position for ref in reflist if ref.label == label)
        for ignore_term in settings.IGNORE_DEFINITIONS_IN['ALL']:
            exclusions.extend(self._word_matches(ignore_term, text))
        return exclusions

    def per_regulation_includes(self, inclusions, label, text):
        cfr_part = label[0]
        if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
            part_includes = settings.INCLUDE_DEFINITIONS_IN[cfr_part]
            for included_term, context in part_includes:
                inclusions.extend(self._word_matches(included_term, text))
        return inclusions

    def included_offsets(self, label, text):
        """ We explicitly include certain chunks of text (for example,
            words that the parser doesn't necessarily pick up as being
            defined) that should be part of a defined term """
        inclusions = []
        for included_term, context in settings.INCLUDE_DEFINITIONS_IN['ALL']:
            inclusions.extend(self._word_matches(included_term, text))
        return inclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=[],
                          inclusions=[]):
        """Search for defined terms in this text, with a preference for all
        larger (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions)
        inclusions = list(inclusions)

        # add plurals to applicable terms
        pluralized = [(inflection.pluralize(t[0]), t[1])
                      for t in applicable_terms]
        applicable_terms += pluralized

        #   longer terms first
        applicable_terms.sort(key=lambda x: len(x[0]), reverse=True)

        matches = []
        for term, ref in applicable_terms:
            re_term = ur'\b' + re.escape(term) + ur'\b'
            offsets = [
                (m.start(), m.end())
                for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                #   Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                #   End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
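
The longest-match preference in calculate_offsets shows up with nested terms:
once a longer term claims a span, that span joins the exclusions, so any
shorter term inside it is skipped (values below are illustrative):

text = 'a credit card and separate credit terms'
exclusions = [(2, 13)]     # 'credit card' matched first (longer term)
start, end = 2, 8          # where the bare 'credit' first matches
assert any(start >= e[0] and start <= e[1] for e in exclusions)
# only the later standalone 'credit' at (27, 33) is kept
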
Example #5
    def setUp(self):
        self.finder = def_finders.ScopeMatch(ScopeFinder())
Example #6
class Terms(Layer):
    shorthand = 'terms'
    STARTS_WITH_WORDCHAR = re.compile('^\w.*$')
    ENDS_WITH_WORDCHAR = re.compile('^.*\w$')

    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()

    def look_for_defs(self, node, stack=None):
        """Check a node and recursively check its children for terms which are
        being defined. Add these definitions to self.scoped_terms."""
        stack = stack or ParentStack()
        stack.add(node.depth(), node)
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.scope_finder.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)

            for child in node.children:
                self.look_for_defs(child, stack)

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Also keep
        track of which subpart we are in. Finally, document all defined terms.
        """
        self.scope_finder.add_subparts(self.tree)
        self.look_for_defs(self.tree)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced or  # New term
                        # Or this term is earlier in the paragraph
                        ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope simply
        applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref  # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as they
        would replace previous (correct) definitions. We also remove terms
        which are inside an instance of the IGNORE_DEFINITIONS_IN setting"""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            if re.search(regex, node.text.lower()):
                return True
            for start, end in self.ignored_offsets(node.label[0], node.text):
                if term in node.text[start:end]:
                    return True
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return ([r for r in references if not self.is_exclusion(r.term, node)],
                [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [(term, ref) for term, ref in applicable_terms.iteritems()
                     if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        matches = sorted(matches, key=lambda (term, r, o): term)
        for term, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        # @todo - this is rather slow -- probably want to memoize the results
        regex = re.escape(term)
        if self.STARTS_WITH_WORDCHAR.match(term):
            regex = r'\b' + regex
        if self.ENDS_WITH_WORDCHAR.match(term):
            regex += r'\b'
        regex = re.compile(regex)
        return [(match.start(), match.end()) for match in regex.finditer(text)]
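
    # These guards matter for phrases that begin or end with a non-word
    # character: a leading \b before, say, '§ 1000' would require a word
    # character *before* the '§', silently dropping matches at the start
    # of the text. \b is only added next to an actual word character.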

    def ignored_offsets(self, cfr_part, text):
        """Return a list of offsets corresponding to the presence of an
        "ignored" phrase in the text"""
        ignored_phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
                           settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
        positions = []
        for phrase in ignored_phrases:
            positions.extend(self._word_matches(phrase, text))
        return positions

    def excluded_offsets(self, node):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(ref.position for ref in reflist
                              if ref.label == node.label_id())
        exclusions.extend(self.ignored_offsets(node.label[0], node.text))
        return exclusions

    def calculate_offsets(self,
                          text,
                          applicable_terms,
                          exclusions=[],
                          inclusions=[]):
        """Search for defined terms in this text, including singular and
        plural forms of these terms, with a preference for all larger
        (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions)
        inclusions = list(inclusions)

        # add singulars and plurals to search terms
        search_terms = set(
            (inflection.singularize(t[0]), t[1]) for t in applicable_terms)
        search_terms |= set(
            (inflection.pluralize(t[0]), t[1]) for t in applicable_terms)

        # longer terms first
        search_terms = sorted(search_terms,
                              key=lambda x: len(x[0]),
                              reverse=True)

        matches = []
        for term, ref in search_terms:
            re_term = ur'\b' + re.escape(term) + ur'\b'
            offsets = [(m.start(), m.end())
                       for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                #   Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                #   End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
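
The singular/plural expansion above leans on the inflection package, e.g.:

import inflection
inflection.singularize('banks')    # -> 'bank'
inflection.pluralize('bank')       # -> 'banks'
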
Example #7
class Terms(Layer):
    shorthand = 'terms'
    STARTS_WITH_WORDCHAR = re.compile(r'^\w.*$')
    ENDS_WITH_WORDCHAR = re.compile(r'^.*\w$')

    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        #   scope -> List[(term, definition_ref)]
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()
        self._inflected = {}

    def inflected(self, term):
        """Check the memoized Inflected version of the provided term"""
        if term not in self._inflected:
            self._inflected[term] = Inflected(
                inflection.singularize(term), inflection.pluralize(term))
        return self._inflected[term]
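
    # Inflected is assumed to be a small iterable pair over the two
    # forms, e.g. defined elsewhere in the module as:
    #     Inflected = namedtuple('Inflected', ['singular', 'plural'])
    # calculate_offsets iterates over self.inflected(term), so each
    # term's forms are computed once and then reused from the cache.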

    def look_for_defs(self, node, stack=None):
        """Check a node and recursively check its children for terms which are
        being defined. Add these definitions to self.scoped_terms."""
        stack = stack or ParentStack()
        stack.add(node.depth(), node)
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.scope_finder.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)

            for child in node.children:
                self.look_for_defs(child, stack)

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Also keep
        track of which subpart we are in. Finally, document all defined terms.
        """
        self.scope_finder.add_subparts(self.tree)
        self.look_for_defs(self.tree)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                if (key not in referenced or  # New term
                        # Or this term is earlier in the paragraph
                        ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this label.
        Note that we don't have to deal with subparts as subpart_scope simply
        applies the definition to all sections in a subpart"""
        applicable_terms = {}
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as they
        would replace previous (correct) definitions. We also remove terms
        which are inside an instance of the IGNORE_DEFINITIONS_IN setting"""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            if re.search(regex, node.text.lower()):
                return True
            for start, end in self.ignored_offsets(node.label[0], node.text):
                if term in node.text[start:end]:
                    return True
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        references = []
        stack = stack or ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        references = [r for r in references if len(r.term) <= MAX_TERM_LENGTH]

        return (
            [r for r in references if not self.is_exclusion(r.term, node)],
            [r for r in references if self.is_exclusion(r.term, node)])

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        #   Remove any definitions defined in this paragraph
        term_list = [
            (term, ref) for term, ref in applicable_terms.items()
            if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        matches = sorted(matches, key=lambda triplet: triplet[0])
        for _, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries"""
        # @todo - this is rather slow -- probably want to memoize the results
        regex = re.escape(term)
        if self.STARTS_WITH_WORDCHAR.match(term):
            regex = r'\b' + regex
        if self.ENDS_WITH_WORDCHAR.match(term):
            regex += r'\b'
        regex = re.compile(regex)
        return [(match.start(), match.end())
                for match in regex.finditer(text)]

    def ignored_offsets(self, cfr_part, text):
        """Return a list of offsets corresponding to the presence of an
        "ignored" phrase in the text"""
        ignored_phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
                           settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
        positions = []
        for phrase in ignored_phrases:
            positions.extend(self._word_matches(phrase, text))
        return positions

    def excluded_offsets(self, node):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future"""
        exclusions = []
        for reflist in self.scoped_terms.values():
            exclusions.extend(
                ref.position for ref in reflist
                if ref.label == node.label_id())
        exclusions.extend(self.ignored_offsets(node.label[0], node.text))
        return exclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=None,
                          inclusions=None):
        """Search for defined terms in this text, including singular and
        plural forms of these terms, with a preference for all larger
        (i.e. containing) terms."""

        # don't modify the original
        exclusions = list(exclusions or [])
        inclusions = list(inclusions or [])

        # add singulars and plurals to search terms
        search_terms = {(inflected, t[1])
                        for t in applicable_terms
                        for inflected in self.inflected(t[0])}

        # longer terms first
        search_terms = sorted(search_terms, key=lambda x: len(x[0]),
                              reverse=True)

        matches = []
        for term, ref in search_terms:
            re_term = r'\b' + re.escape(term) + r'\b'
            offsets = [
                (m.start(), m.end())
                for m in re.finditer(re_term, text.lower())]
            safe_offsets = []
            for start, end in offsets:
                #   Start is contained in an existing def
                if any(start >= e[0] and start <= e[1] for e in exclusions):
                    continue
                #   End is contained in an existing def
                if any(end >= e[0] and end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches
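
A hedged usage sketch of the layer as a whole (assuming, as pre_process
implies, that the Layer base class stores the regulation tree on self.tree;
tree and some_node are illustrative names):

layer = Terms(tree)
layer.pre_process()                # find definitions, fill scoped_terms
result = layer.process(some_node)
# -> [{'ref': 'term:label', 'offsets': [(start, end), ...]}, ...]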