from pathlib import Path
from typing import List, Tuple


class NegexTriggerTagger:
    def __init__(self, rules: List[List[str]] = None, tokens_range: int = 40):
        if rules is None:
            with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
                rules = make_rules(f)
        self.dawg = DAWG()
        for rule, tag in rules:
            try:
                tags = self.dawg[rule]
            except KeyError:
                tags = []
                self.dawg[rule] = tags
            tags.append(tag)
        self.tokens_range = tokens_range

    def detect_negex_triggers(self, sentence: str):
        # tokenize the sentence using an anti-whitespace pattern.
        tokens = []
        for match in _word_pattern.finditer(sentence):
            tokens.append((match.start(), match.end()))

        # use a DAWG matcher to locate all trigger phrases in the sentence.
        matcher = self.dawg.matcher()
        triggers = []
        for i, (begin, end) in enumerate(tokens):
            word = _not_word.sub('', sentence[begin:end].lower())
            if len(word) > 0:
                hits = matcher.advance(word)
                for length, tags in hits:
                    first_token_idx = i + 1 - length
                    triggers.append(
                        (tokens[first_token_idx][0], tokens[i][1], tags))
        return triggers
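# A minimal usage sketch, not part of the original module: it assumes the
# module-level helpers referenced above (DAWG, make_rules, _word_pattern,
# _not_word) are defined in this file, and it passes explicit rules so the
# bundled negex_triggers.txt is not needed. The rule phrases, the 'PREN' tag,
# and the sample sentence are illustrative assumptions only.
def example_detect_negex_triggers():
    tagger = NegexTriggerTagger(rules=[
        (['no', 'evidence', 'of'], 'PREN'),
        (['denies'], 'PREN'),
    ])
    sentence = 'Patient denies chest pain, no evidence of pneumonia.'
    for begin, end, tags in tagger.detect_negex_triggers(sentence):
        # each result is (start offset, end offset, tags) over the sentence
        print(sentence[begin:end], tags)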
def test_matcher():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    matcher = dawg.matcher()
    assert matcher.advance('x') == []
    assert matcher.advance('a') == []
    assert matcher.advance('b') == []
    # a completed sequence is reported on its final token as (length, value)
    assert matcher.advance('c') == [(3, True)]
    assert matcher.advance('d') == []
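# A follow-on sketch, not in the original tests: a fresh matcher over the same
# DAWG should be able to complete the other stored sequence, ['a', 'b', 'd'],
# assuming advance() behaves as demonstrated in test_matcher above.
def test_matcher_other_sequence():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    matcher = dawg.matcher()
    assert matcher.advance('a') == []
    assert matcher.advance('b') == []
    assert matcher.advance('d') == [(3, False)]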
class NegexTagger:
    def __init__(self, rules: List[List[str]] = None):
        if rules is None:
            with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
                rules = make_rules(f)
        self.dawg = DAWG()
        for rule, tag in rules:
            self.dawg[rule] = tag

    def check_sentence(
            self,
            sentence: str,
            terms: List[Tuple[int, int]]
    ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
        """Checks the sentence for negated terms.

        Args:
            sentence (str): The sentence.
            terms (~typing.List[~typing.Tuple[int, int]]):
                A list of (start offset, end offset) tuples which indicate the
                locations of terms within the sentence to test for negation.

        Returns:
            negated terms (~typing.List[~typing.Tuple[int, int]]):
                The terms in the input which are negated. Start offset, end
                offset relative to the sentence.
            negation triggers (~typing.List[~typing.Tuple[int, int]]):
                The spans of text which are negation triggers.
        """
        if len(terms) == 0:
            return [], []

        # tokenize the sentence using an anti-whitespace pattern.
        tokens = []
        for match in _word_pattern.finditer(sentence):
            tokens.append((match.start(), match.end()))

        # map each term's character offsets to (first token, last token) indices.
        term_indices = []
        for (term_start, term_end) in terms:
            term_start_index = -1
            for i, (token_start, token_end) in enumerate(tokens):
                if term_start_index == -1 and token_start >= term_start:
                    term_start_index = i
                if token_end >= term_end:
                    term_indices.append((term_start_index, i))
                    break

        # use a DAWG matcher to locate all negation triggers in the sentence.
        matcher = self.dawg.matcher()
        triggers = []
        for i, (begin, end) in enumerate(tokens):
            word = _not_word.sub('', sentence[begin:end].lower())
            if len(word) > 0:
                hits = matcher.advance(word)
                for length, tag in hits:
                    first_token_idx = i + 1 - length
                    triggers.append((first_token_idx, i, tag))

        # a term is negated by a PREN trigger within 5 tokens before it, or a
        # POST trigger within 5 tokens after it.
        negations = []
        neg_triggers = []
        for (term_start, term_end) in term_indices:
            for i, (trigger_start, trigger_end, tag) in enumerate(triggers):
                if term_start - trigger_end in range(1, 6):
                    if tag == 'PREN':
                        negations.append(
                            (tokens[term_start][0], tokens[term_end][1]))
                        neg_triggers.append(
                            (tokens[trigger_start][0], tokens[trigger_end][1]))
                        break
                if trigger_start - term_end in range(1, 6) and tag == 'POST':
                    negations.append(
                        (tokens[term_start][0], tokens[term_end][1]))
                    neg_triggers.append(
                        (tokens[trigger_start][0], tokens[trigger_end][1]))
                    break
        return negations, neg_triggers
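# A minimal usage sketch, not part of the original module: it runs
# check_sentence over a sample sentence with a single term span, using an
# explicit illustrative rule in place of the bundled negex_triggers.txt.
# The rule phrase, sentence, and term offsets are assumptions for
# demonstration only.
def example_check_sentence():
    tagger = NegexTagger(rules=[(['no', 'evidence', 'of'], 'PREN')])
    sentence = 'There is no evidence of pneumonia.'
    start = sentence.index('pneumonia')
    terms = [(start, start + len('pneumonia'))]
    negations, neg_triggers = tagger.check_sentence(sentence, terms)
    # returned spans are token-aligned, so they may include trailing punctuation
    print([sentence[b:e] for b, e in negations])
    print([sentence[b:e] for b, e in neg_triggers])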