Example #1
def __init__(self, rules: List[List[str]] = None):
    if rules is None:
        with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
            rules = make_rules(f)
    self.dawg = DAWG()
    for rule, tag in rules:
        self.dawg[rule] = tag
Example #2
def test_matcher():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    matcher = dawg.matcher()
    assert matcher.advance('x') == []
    assert matcher.advance('a') == []
    assert matcher.advance('b') == []
    assert matcher.advance('c') == [(3, True)]
    assert matcher.advance('d') == []
Example #3
def __init__(self, rules: List[List[str]] = None, tokens_range: int = 40):
    if rules is None:
        with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
            rules = make_rules(f)
    self.dawg = DAWG()
    for rule, tag in rules:
        try:
            tags = self.dawg[rule]
        except KeyError:
            tags = []
            self.dawg[rule] = tags
        tags.append(tag)
    self.tokens_range = tokens_range
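In contrast to Example #1, this constructor keeps every tag registered for the same rule by storing a list in the DAWG and appending to it. A hedged illustration, assuming this is the NegexTriggerTagger constructor shown in Example #4 and using made-up rule pairs:

# Hypothetical rule pairs: the same token sequence appears twice.
tagger = NegexTriggerTagger(rules=[(['no'], 'PREN'), (['no'], 'PSEU')])
# Both tags are kept rather than the second overwriting the first:
# tagger.dawg[['no']] -> ['PREN', 'PSEU']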
Example #4
class NegexTriggerTagger:
    def __init__(self, rules: List[List[str]] = None, tokens_range: int = 40):
        if rules is None:
            with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
                rules = make_rules(f)
        self.dawg = DAWG()
        for rule, tag in rules:
            try:
                tags = self.dawg[rule]
            except KeyError:
                tags = []
                self.dawg[rule] = tags
            tags.append(tag)
        self.tokens_range = tokens_range

    def detect_negex_triggers(self, sentence: str):
        # tokenize the sentence using an anti-whitespace pattern.
        tokens = []
        for match in _word_pattern.finditer(sentence):
            tokens.append((match.start(), match.end()))

        # use a DAWG matcher to locate all trigger phrases in the sentence
        matcher = self.dawg.matcher()
        triggers = []
        for i, (begin, end) in enumerate(tokens):
            word = _not_word.sub('', sentence[begin:end].lower())
            if len(word) > 0:
                hits = matcher.advance(word)
                for length, tags in hits:
                    first_token_idx = i + 1 - length
                    triggers.append(
                        (tokens[first_token_idx][0], tokens[i][1], tags))
        return triggers
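A short usage sketch for detect_negex_triggers. The constructor's loop consumes (token list, tag) pairs, so a single made-up rule is passed explicitly to avoid depending on negex_triggers.txt and make_rules; the sentence and expected span are assumptions that rely on _word_pattern tokenizing on whitespace.

# Hypothetical rule and sentence, for illustration only.
tagger = NegexTriggerTagger(rules=[(['no', 'evidence', 'of'], 'PREN')])
spans = tagger.detect_negex_triggers("No evidence of pneumonia today")
# Assuming whitespace tokenization, "No evidence of" covers characters 0-14:
# spans -> [(0, 14, ['PREN'])]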
Example #5
def test_delete():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    del dawg[['a', 'b', 'd']]
    assert list(dawg) == [['a', 'b', 'c']]
    assert len(dawg) == 1
Example #6
def test_create_dawg():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    assert ['a', 'b', 'c'] in dawg
    assert ['a', 'b', 'd'] in dawg
    assert ['a', 'b'] not in dawg
    assert ['a', 'b', 'x'] not in dawg
    assert dawg[['a', 'b', 'c']]
    assert not dawg[['a', 'b', 'd']]
Example #7
def test_delete_absent():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    with pytest.raises(KeyError):
        del dawg[['a', 'b']]
Example #8
def test_iter():
    dawg = DAWG()
    dawg[['a', 'b', 'c']] = True
    dawg[['a', 'b', 'd']] = False
    assert ['a', 'b', 'c'] in list(dawg)
    assert ['a', 'b', 'd'] in list(dawg)
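The tests in Examples #2 and #5 through #8 pin down the DAWG interface used throughout: a mapping from token sequences to values with deletion, iteration, length, and an incremental matcher whose advance() returns (match length, stored value) pairs for sequences ending at the token just consumed. The sketch below is a hypothetical trie-backed stand-in that satisfies those tests; it is not the project's actual DAWG (which would presumably merge equivalent suffixes), and the _Matcher and _VALUE names are invented here.

from typing import Any, Dict, Iterator, List, Tuple

_VALUE = object()  # sentinel dict key marking "a stored sequence ends here"


class DAWG:
    """Trie-backed stand-in for the interface the tests above exercise."""

    def __init__(self) -> None:
        self._root: Dict[Any, Any] = {}
        self._len = 0

    def __setitem__(self, key: List[str], value: Any) -> None:
        node = self._root
        for part in key:
            node = node.setdefault(part, {})
        if _VALUE not in node:
            self._len += 1
        node[_VALUE] = value

    def _terminal(self, key: List[str]) -> Dict[Any, Any]:
        node = self._root
        for part in key:
            node = node[part]  # KeyError if the path does not exist
        if _VALUE not in node:
            raise KeyError(key)
        return node

    def __getitem__(self, key: List[str]) -> Any:
        return self._terminal(key)[_VALUE]

    def __delitem__(self, key: List[str]) -> None:
        del self._terminal(key)[_VALUE]
        self._len -= 1

    def __contains__(self, key: List[str]) -> bool:
        try:
            self._terminal(key)
        except KeyError:
            return False
        return True

    def __len__(self) -> int:
        return self._len

    def __iter__(self) -> Iterator[List[str]]:
        # depth-first walk yielding every token sequence that has a stored value
        stack = [([], self._root)]
        while stack:
            prefix, node = stack.pop()
            for part, child in node.items():
                if part is _VALUE:
                    yield prefix
                else:
                    stack.append((prefix + [part], child))

    def matcher(self) -> '_Matcher':
        return _Matcher(self._root)


class _Matcher:
    """Consumes tokens one at a time, tracking every partial match still alive."""

    def __init__(self, root: Dict[Any, Any]) -> None:
        self._root = root
        self._active: List[Tuple[int, Dict[Any, Any]]] = []

    def advance(self, token: str) -> List[Tuple[int, Any]]:
        hits: List[Tuple[int, Any]] = []
        survivors: List[Tuple[int, Dict[Any, Any]]] = []
        # extend each live partial match, plus a fresh match starting at this token
        for length, node in self._active + [(0, self._root)]:
            child = node.get(token)
            if child is None:
                continue
            survivors.append((length + 1, child))
            if _VALUE in child:
                hits.append((length + 1, child[_VALUE]))
        self._active = survivors
        return hits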
Example #9
class NegexTagger:
    def __init__(self, rules: List[List[str]] = None):
        if rules is None:
            with (Path(__file__).parent / 'negex_triggers.txt').open('r') as f:
                rules = make_rules(f)
        self.dawg = DAWG()
        for rule, tag in rules:
            self.dawg[rule] = tag

    def check_sentence(
        self, sentence: str, terms: List[Tuple[int, int]]
    ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
        """Checks the sentence for negated terms.

        Args:
            sentence (str): The sentence.
            terms (~typing.List[~typing.Tuple[int, int]]):
                A list of (start offset, end offset) tuples which indicate the locations of terms
                within the sentence to test for negation.

        Returns:
            negated terms (~typing.List[~typing.Tuple[int, int]]):
                The terms in the input which are negated. Start offset, end offset relative to the
                sentence.
            negation triggers (~typing.List[~typing.Tuple[int, int]]):
                The spans of text which are negation triggers.
        """
        if len(terms) == 0:
            return [], []
        # tokenize the sentence using an anti-whitespace pattern.
        tokens = []
        for match in _word_pattern.finditer(sentence):
            tokens.append((match.start(), match.end()))

        term_indices = []
        for (term_start, term_end) in terms:
            term_start_index = -1
            for i, (token_start, token_end) in enumerate(tokens):
                if term_start_index == -1 and token_start >= term_start:
                    term_start_index = i
                if token_end >= term_end:
                    term_indices.append((term_start_index, i))
                    break

        # use a DAWG matcher to locate all trigger phrases in the sentence
        matcher = self.dawg.matcher()
        triggers = []
        for i, (begin, end) in enumerate(tokens):
            word = _not_word.sub('', sentence[begin:end].lower())
            if len(word) > 0:
                hits = matcher.advance(word)
                for length, tag in hits:
                    first_token_idx = i + 1 - length
                    triggers.append((first_token_idx, i, tag))

        negations = []
        neg_triggers = []

        for (term_start, term_end) in term_indices:
            for i, (trigger_start, trigger_end, tag) in enumerate(triggers):
                if term_start - trigger_end in range(1, 6):
                    if tag == 'PREN':
                        negations.append(
                            (tokens[term_start][0], tokens[term_end][1]))
                        neg_triggers.append(
                            (tokens[trigger_start][0], tokens[trigger_end][1]))
                        break
                if trigger_start - term_end in range(1, 6) and tag == 'POST':
                    negations.append(
                        (tokens[term_start][0], tokens[term_end][1]))
                    neg_triggers.append(
                        (tokens[trigger_start][0], tokens[trigger_end][1]))
                    break

        return negations, neg_triggers
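A usage sketch matching the docstring above. The rule, sentence, and term offsets are invented for illustration, and the expected spans assume _word_pattern splits on whitespace.

# Hypothetical example: "pneumonia" occupies offsets (15, 24) and is preceded
# by the pre-negation ('PREN') trigger "No evidence of".
tagger = NegexTagger(rules=[(['no', 'evidence', 'of'], 'PREN')])
negated, neg_triggers = tagger.check_sentence(
    "No evidence of pneumonia today", terms=[(15, 24)])
# negated      -> [(15, 24)]  span of the negated term "pneumonia"
# neg_triggers -> [(0, 14)]   span of the trigger "No evidence of"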