def main(sents, mbt_filename, option):
    """Build POS/negation feature records for finding classification.

    Splits the input sentences, strips the finding tags, POS-tags the
    sentences with the external Mbt tagger, merges finding/negation/POS
    tags, and writes one comma-separated feature record per sentence to
    stdout (with the class label appended in train mode).

    Arguments:
        sents: an iterable with sentences.
        mbt_filename: the tagger settings' file.
        option: 'train' or 'test'.
    """
    # BUG FIX: the original iterated over an undefined name `f`
    # (leftover from a file-handle version); the input is `sents`.
    splits_lists = (split_sent(s.strip()) for s in sents)
    splits_lists = (s for s in splits_lists if s)  # remove non-tagged lines
    sents = functools.reduce(op.concat, splits_lists, [])

    if option == 'train':
        removes = (remove_tag(s, train=True) for s in sents)
        # remove ambiguous findings
        removes = [(s, f, c) for s, f, c in removes if c != AMBIGUOUS]
        clean_sents, finding_tokens, classes = zip(*removes)
    else:
        removes = [remove_tag(s) for s in sents]
        clean_sents, finding_tokens = zip(*removes)

    word_tokenizer = tokenizer.Tokenizer()
    span_t_sents = [word_tokenizer.span_tokenize(s) for s in clean_sents]
    assert len(span_t_sents) == len(clean_sents)

    # prepare Mbt data: one token per line, sentences separated by <utt>
    t_sents = list()
    for ts, s in zip(span_t_sents, clean_sents):
        t_sent = [s[slice(*t)] for t in ts]
        t_sent.append('<utt>')
        t_sents.append(t_sent)
    assert len(t_sents) == len(clean_sents)

    # call Mbt -s mbt_filename < t_sents
    p = subprocess.Popen(['/usr/bin/Mbt', '-s', mbt_filename],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    formatted_data = bytes(
        '\n'.join(functools.reduce(op.concat, t_sents, [])), 'utf8')
    pos_tagged_data = p.communicate(formatted_data)[0]

    pos_tagged_sents = str(pos_tagged_data, 'utf8').split('<utt>')
    pos_tagged_sents = [s.strip() for s in pos_tagged_sents if s]
    pos_tagged_sents = [s for s in pos_tagged_sents if s]
    assert_msg = ('len(pos_tagged_sents) == ' + str(len(pos_tagged_sents))
                  + '\nlen(clean_sents) == ' + str(len(clean_sents)))
    assert len(pos_tagged_sents) == len(clean_sents), assert_msg

    # transform sents into lists of (word, tag) token-tuples
    pos_tuple_sents = list()
    for sent in pos_tagged_sents:
        tokens = sent.split()
        pos_tuple_sents.append([str2tuple(t) for t in tokens])

    # cross-check Mbt's tokens against our spans, keep (start, end, tag)
    token_sents = list()
    assert len(span_t_sents) == len(pos_tuple_sents)
    for span_sent, tag_sent, clean_sent in zip(span_t_sents,
                                               pos_tuple_sents,
                                               clean_sents):
        assert len(span_sent) == len(tag_sent)
        for span_t, (tag_w, tag_t) in zip(span_sent, tag_sent):
            assert tag_w == clean_sent[slice(*span_t)]
        token_sents.append([(s, e, t)
                            for (s, e), (w, t) in zip(span_sent, tag_sent)])

    feature_records = list()
    neg_tagger = negex.Tagger()
    assert len(clean_sents) == len(finding_tokens)
    for sent, token_sent, finding_tok in zip(clean_sents, token_sents,
                                             finding_tokens):
        neg_tags = neg_tagger.neg_tag(sent)
        # merge finding, negation, and POS tags into a single tag list
        final_sent_tokens = merge_taglists([finding_tok], neg_tags)
        final_sent_tokens = merge_taglists(final_sent_tokens, token_sent)
        finding_index = final_sent_tokens.index(finding_tok)
        # ',' would break the CSV feature record, so rename it
        change_comma = lambda t: 'comma' if t == ',' else t
        final_sent_tags = [change_comma(token[2])
                           for token in final_sent_tokens]
        # window of tags before the finding, left-padded with NULL
        feat_previous = final_sent_tags[
            max(0, finding_index - FEATURE_WINDOW):finding_index]
        previous_remaining = FEATURE_WINDOW - len(feat_previous)
        if previous_remaining:
            feat_previous = previous_remaining * ['NULL'] + feat_previous
        # window of tags after the finding, right-padded with NULL
        feat_following = final_sent_tags[
            finding_index + 1:finding_index + 1 + FEATURE_WINDOW]
        following_remaining = FEATURE_WINDOW - len(feat_following)
        if following_remaining:
            feat_following += following_remaining * ['NULL']
        feature_records.append(
            ','.join(feat_previous + ['FIND'] + feat_following))

    if option == 'train':
        feature_records = (f + ',' + c
                           for f, c in zip(feature_records, classes))
    sys.stdout.writelines([f + '\n' for f in feature_records])
def annotate(self, sentence, token_window=-1):
    """Returns the sentence annotated with negation classification.

    Arguments:
    sentence - string to be tagged.
    token_window - the number of tokens, to the left or right of the
        trigger, inside the scope of the trigger term. If it is less
        than or equal zero, the scope goes until the end or beginning
        of the sentence.
    """
    self.original_sentence = sentence
    # Merge tag lists in order: findings, then negation triggers, then
    # plain tokens. NOTE(review): presumably merge_taglists gives
    # precedence to spans merged earlier — confirm in internals.
    spans = self.word_tokenizer.tokenize_findings(sentence, list())
    spans = internals.merge_taglists(spans, self.neg_tag(sentence))
    plain_tokens = [(s, e, 'TOKE')
                    for s, e in self.word_tokenizer.span_tokenize(sentence)]
    spans = internals.merge_taglists(spans, plain_tokens)

    triggers = list()
    findings = dict()
    for i in range(len(spans)):
        start, end, tagname = spans[i]
        if self.trigger_tag_pat.search(tagname):
            scope = self._scope(spans, i, tagname, token_window)
            if len(scope) > 0:
                scope_start = scope[0][0]
                scope_end = scope[-1][1]
                trig = Trigger(sentence[start:end], tagname)
                trig.set_scope(sentence[scope_start:scope_end])
                trig.set_span(scope_start, scope_end)
                triggers.append(trig)
        elif tagname == 'FIND':
            findings[(start, end)] = Finding(
                sentence[start:end], start, end,
                ClassificationEnum.Affirmative)

    # for overlapping scopes, the ascending order of precendence is
    # POSP, PREP, POST, and PREN.
    by_precedence = lambda t: self.precedence[t.tag()]
    triggers.sort(key=by_precedence)
    t_group = [list(g) for _, g in itertools.groupby(triggers,
                                                     by_precedence)]

    findings_triggers = dict()
    for group in t_group:
        for trig in group:
            t_start, t_end = trig.span()
            for (f_start, f_end), finding in findings.items():
                # finding lies entirely inside the scope of this trigger
                if f_start >= t_start and f_end <= t_end:
                    finding.set_classification(trig.classification())
                    findings_triggers[finding] = trig

    return AnnotatedSentence(sentence, list(findings.values()),
                             triggers, findings_triggers)