def _featurise_negated(_id, id_to_ann, doc_base, txt_doc):
    """Yield string features for the event annotation identified by ``_id``.

    The features are produced lazily, in this order: the trigger text, the
    trigger prefixes, the heuristic marks that target ``_id`` and finally
    the tokens surrounding the trigger in its sentence.

    Arguments:
    _id -- key of the event annotation in ``id_to_ann``; also compared
        against ``mark.target`` of every heuristic mark.
    id_to_ann -- mapping from annotation id to annotation. The event
        annotation must expose ``trigger`` (the id of its trigger
        annotation) and the trigger annotation must expose ``comment`` and
        ``start``.
    doc_base -- path prefix of the document; ``doc_base + '.nesp.st'`` is
        opened and parsed on every call.
    txt_doc -- object offering ``sentence_by_offset(offset)``; the returned
        sentence must offer ``token_by_offset(offset)`` and a sliceable
        ``tokens`` sequence whose items carry a ``text`` attribute.

    Spaces inside emitted texts are replaced by '^' (span tokens are
    whitespace-split instead, so they never contain a space).
    """
    e_ann = id_to_ann[_id]
    trigg_ann = id_to_ann[e_ann.trigger]

    # ^ does not occur in the train and dev sets, so it is relatively safe:
    #   sed -e 's| ||g' -e 's|\(.\)|\1\n|g' \
    #       res/ann/{epi,id,ge}/{train,dev}/*.txt | sed '/^$/d' | sort \
    #       | uniq -c | sort -n -r | grep '\^'
    trigger_text = trigg_ann.comment.replace(' ', '^')

    # Feature: The actual text
    yield 'TRIGGER-TEXT-{}'.format(trigger_text)

    # Feature: Prefixes of length 2 to 7, never longer than the text itself.
    # `range` behaves the same as `xrange` for these tiny bounds and exists
    # on both Python 2 and Python 3.
    for size in range(2, min(8, len(trigger_text) + 1)):
        yield 'TRIGGER-PREFIX-{}-{}'.format(size, trigger_text[:size])

    # XXX: Not efficient! Re-done for every annotation that is featurised!
    ee_anns = list(id_to_ann.values())
    nesp_path = doc_base + '.nesp.st'
    # Imported here, after the first features have been yielded, exactly as
    # in the original code, so the import still only happens on demand.
    from lib.ann import parse_ann
    with open(nesp_path, 'r') as nesp_file:
        # XXX: Remove the ugly hack to protect us from newlines!
        # Blank lines are skipped and trailing newlines stripped before the
        # lines reach the parser; the result is materialised while the file
        # is still open.
        nesp_anns = list(parse_ann(
            line.rstrip('\n') for line in nesp_file if line.strip()))

    from lib.heuristic import nesp_heuristic
    for mark in nesp_heuristic(ee_anns, nesp_anns):
        if mark.target != _id:
            continue
        # The template is already upper-case; mark.type is inserted as-is.
        heuristic_base = 'HEURISTIC-{}-{}'.format(
            mark.type, 'ROOT' if mark.root else 'NON-ROOT')
        yield heuristic_base
        yield heuristic_base + '-' + mark.cue.comment.replace(' ', '^')
        for span_token in mark.span.comment.split():
            yield heuristic_base + '-' + span_token

    # Event type, actually boosts us a bit, we'll come back to this later
    # XXX: Causes negation to spike with precision, hurts speculation
    #yield 'EVENT-TYPE-{}'.format(e_ann.type)

    # Contextual features
    sentence = txt_doc.sentence_by_offset(trigg_ann.start)

    # BoW for the trigger context
    trigger_token = sentence.token_by_offset(trigg_ann.start)
    trigger_token_index = sentence.tokens.index(trigger_token)

    # Up to three tokens directly preceding the trigger, in sentence order.
    first_pre_index = max(0, trigger_token_index - 3)
    for pre_trigg_tok in sentence.tokens[first_pre_index:trigger_token_index]:
        yield 'PRE-TRIGG-BOW:{}'.format(pre_trigg_tok.text.replace(' ', '^'))

    # XXX: Hurts performance, as expected
    #for sent_tok in sentence.tokens:
    #    yield 'SENT-BOW:{}'.format(sent_tok.text.replace(' ', '^'))

    # Up to three tokens directly following the trigger, emitted farthest
    # first. This is the same order the original produced by reversing the
    # whole tail and taking its last three items, without copying the tail.
    following = sentence.tokens[trigger_token_index + 1:trigger_token_index + 4]
    for post_trigg_tok in reversed(following):
        yield 'POST-TRIGG-BOW:{}'.format(post_trigg_tok.text.replace(' ', '^'))
def main(args):
    """Write a modifier line for every mark produced by the heuristic.

    ``args[1:]`` is parsed with ``ARGPARSER``. Annotations are read from
    ``argp.event_extraction_output`` (parsed items that are plain ``str``
    are dropped) and from ``argp.negation_speculation_output``. For each
    mark returned by ``nesp_heuristic(..., root_internal=True)`` one
    ``Modifier`` line is written to ``argp.output``. New modifiers are
    numbered ``M<n>``, continuing after the highest number found among the
    ``Modifier`` annotations already in the event extraction output
    (starting at 1 when there are none).

    Returns 0.
    """
    argp = ARGPARSER.parse_args(args[1:])

    # Event extraction output: keep everything the parser yields except
    # plain strings.
    ee_anns = []
    for parsed in parse_ann(
            line.rstrip('\n') for line in argp.event_extraction_output):
        if isinstance(parsed, str):
            continue
        ee_anns.append(parsed)

    nesp_anns = list(parse_ann(
        line.rstrip('\n') for line in argp.negation_speculation_output))

    # Highest modifier number already in use; 0 when no modifier exists.
    # The id is assumed to be one leading character followed by digits.
    highest_m_num = 0
    for ann in ee_anns:
        if isinstance(ann, Modifier):
            highest_m_num = max(highest_m_num, int(ann.id[1:]))

    marks = nesp_heuristic(ee_anns, nesp_anns, root_internal=True)
    for m_num, mark in enumerate(marks, highest_m_num + 1):
        modifier = Modifier('M{}'.format(m_num), mark.type, mark.target)
        argp.output.write(unicode(modifier) + '\n')

    return 0