Example #1
import functools
import operator as op
import subprocess
import sys

# split_sent, remove_tag, merge_taglists, str2tuple, AMBIGUOUS, FEATURE_WINDOW,
# and the tokenizer and negex modules are assumed to be defined or imported
# elsewhere in this module.


def main(sents, mbt_filename, option):
    """
    
    Arguments: 
    sents: an iterable with sentences.
    mbt_filename: the tagger settings' file.
    option: train or test

    """
    splits_lists = (split_sent(s.strip()) for s in sents)
    splits_lists = (s for s in splits_lists if s)  # remove non-tagged lines
    sents = functools.reduce(op.concat, splits_lists, [])
    if option == 'train':
        removes = (remove_tag(s, train=True) for s in sents)
        removes = [(s, f, c) for s, f, c in removes if c != AMBIGUOUS] # remove ambiguous findings
        clean_sents, finding_tokens, classes = zip(*removes)
    else:
        removes = [remove_tag(s) for s in sents]
        clean_sents, finding_tokens = zip(*removes)

    tok = tokenizer.Tokenizer()
    span_t_sents = [tok.span_tokenize(s) for s in clean_sents]
    assert len(span_t_sents) == len(clean_sents)
    t_sents = list()
    # prepare Mbt data
    for ts, s in zip(span_t_sents, clean_sents):
        t_sent = [s[slice(*t)] for t in ts]
        t_sent.append('<utt>')
        t_sents.append(t_sent)

    assert len(t_sents) == len(clean_sents)

    # call Mbt -s mbt_filename < t_sents
    p = subprocess.Popen(['/usr/bin/Mbt', '-s', mbt_filename], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
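    # Mbt expects one token per line on stdin; '<utt>' lines mark sentence
    # boundaries, which is why each t_sent above ends with '<utt>'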
    formatted_data = bytes('\n'.join(functools.reduce(op.concat, t_sents, [])), 'utf8')
    pos_tagged_data = p.communicate(formatted_data)[0]
    pos_tagged_sents = str(pos_tagged_data, 'utf8').split('<utt>')
    pos_tagged_sents = [s.strip() for s in pos_tagged_sents]
    pos_tagged_sents = [s for s in pos_tagged_sents if s]  # drop chunks that were empty or whitespace-only

    assert_msg = 'len(pos_tagged_sents) == {}\nlen(clean_sents) == {}'.format(
        len(pos_tagged_sents), len(clean_sents))
    assert len(pos_tagged_sents) == len(clean_sents), assert_msg

    # transform sents into lists of token-tuples
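    # each whitespace-separated item is a single 'word/TAG' string; str2tuple
    # (assumed to follow the NLTK convention) splits it at the last slash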
    pos_tuple_sents = list()
    for sent in pos_tagged_sents:
        tokens = sent.split()
        pos_tuple_sents.append([str2tuple(t) for t in tokens])

    
    token_sents = list()
    assert len(span_t_sents) == len(pos_tuple_sents)
    for span_sent, tag_sent, clean_sent in zip(span_t_sents, pos_tuple_sents, clean_sents):
        # sanity check: Mbt's tokens must line up with our tokenizer's spans
        assert len(span_sent) == len(tag_sent)
        for span_t, (tag_w, tag_t) in zip(span_sent, tag_sent):
            assert tag_w == clean_sent[slice(*span_t)]
        token_sents.append([(s, e, t) for (s, e), (w, t) in zip(span_sent, tag_sent)])

    feature_records = list()
    neg_tagger = negex.Tagger()
    assert len(clean_sents) == len(finding_tokens)
    for sent, token_sent, finding_tok in zip(clean_sents, token_sents, finding_tokens):
        neg_tags = neg_tagger.neg_tag(sent)
        final_sent_tokens = merge_taglists([finding_tok], neg_tags)
        final_sent_tokens = merge_taglists(final_sent_tokens, token_sent)

        finding_index = final_sent_tokens.index(finding_tok)
        final_sent_tags = (tok[2] for tok in final_sent_tokens)
        # rename ',' tags so they cannot collide with the comma-separated record format
        final_sent_tags = ['comma' if tag == ',' else tag for tag in final_sent_tags]
        
        feat_previous = final_sent_tags[max(0, finding_index-FEATURE_WINDOW):finding_index]
        previous_remaining = FEATURE_WINDOW - len(feat_previous)
        if previous_remaining:
            feat_previous = previous_remaining * ['NULL'] + feat_previous

        feat_following = final_sent_tags[finding_index+1:finding_index+1+FEATURE_WINDOW]
        following_remaining = FEATURE_WINDOW - len(feat_following)
        if following_remaining:
            feat_following += following_remaining * ['NULL']

        feature_records.append(','.join(feat_previous + ['FIND'] + feat_following))
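        # e.g. with FEATURE_WINDOW == 2 the record just appended might read
        # 'NULL,DT,FIND,NN,comma' (the POS tags here are purely illustrative)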

    if option == 'train':
        feature_records = (f + ',' + c for f, c in zip(feature_records, classes))

    sys.stdout.writelines([f+'\n' for f in feature_records])
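
A minimal invocation sketch for the function above (the file names are placeholders; split_sent, remove_tag, merge_taglists, and str2tuple are assumed to be defined in the same module):

if __name__ == '__main__':
    with open('sentences.txt', encoding='utf8') as f:
        main(f, 'tagger.settings', 'train')
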
Example #2
    def annotate(self, sentence, token_window=-1):
        """Returns the sentence annotated with negation classification.

        Arguments:
        sentence - string to be tagged.
        token_window - the number of tokens, to the left or right of the trigger, that fall inside the trigger term's scope.
        If it is less than or equal to zero, the scope extends to the beginning or end of the sentence.

        """
        self.original_sentence = sentence
        all_spans = list()
        # tokenize all findings
        all_spans = self.word_tokenizer.tokenize_findings(sentence, all_spans)
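        # all_spans now presumably holds (start, end, tag) triples, with findings tagged 'FIND'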

        # Identify triggers and tag them
        neg_tags = self.neg_tag(sentence)
        all_spans = internals.merge_taglists(all_spans, neg_tags)
        
        # tokenize the rest
        token_spans = self.word_tokenizer.span_tokenize(sentence)
        all_spans = internals.merge_taglists(all_spans, [(s, e, 'TOKE') for s, e in token_spans])
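        # the merge order matters: FIND and trigger spans are assumed to take
        # precedence over plain 'TOKE' spans where they overlap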
            
        triggers = list()
        findings = dict()
        for i, tok in enumerate(all_spans):
            tagname = tok[2]
            trigger_tag_match = self.trigger_tag_pat.search(tagname)
            if trigger_tag_match:
                trigger_scope = self._scope(all_spans, i, tagname, token_window)
                if trigger_scope:
                    scope_start = trigger_scope[0][0]
                    scope_end = trigger_scope[-1][1]
                    name = sentence[tok[0]:tok[1]]
                    trig = Trigger(name, tagname)
                    trig.set_scope(sentence[scope_start:scope_end])
                    trig.set_span(scope_start, scope_end)
                    triggers.append(trig)
            elif tagname == 'FIND':
                f_start, f_end = tok[0], tok[1]
                f_text = sentence[f_start:f_end]
                # findings start out Affirmative; matching triggers may reclassify them below
                findings[(f_start, f_end)] = Finding(f_text, f_start, f_end, ClassificationEnum.Affirmative)

        # sort triggers by precedence and group them; groups applied later
        # overwrite the classifications set by earlier ones
        t_keyfunc = lambda t: self.precedence[t.tag()]
        triggers.sort(key=t_keyfunc)
        t_group = [list(g) for k, g in itertools.groupby(triggers, t_keyfunc)]

        findings_triggers = dict()
        # for overlapping scopes, the ascending order of precedence is POSP, PREP, POST, and PREN.
        for g in t_group:
            for t in g:
                t_span = t.span()
                for k, finding in findings.items():
                    if k[0] >= t_span[0] and k[1] <= t_span[1]:
                        # in the scope of t
                        finding.set_classification(t.classification())
                        findings_triggers[finding] = t

        
        return AnnotatedSentence(sentence, list(findings.values()), triggers, findings_triggers)
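
A hedged usage sketch for annotate, assuming it is a method of the negex.Tagger class seen in Example #1 and that the input sentence carries whatever finding markup tokenize_findings expects (both assumptions):

tagger = negex.Tagger()  # assumed host class of annotate()
annotated = tagger.annotate('No evidence of pleural effusion.', token_window=5)
print(annotated)  # an AnnotatedSentence: findings, their classifications, and triggers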