コード例 #1
0
ファイル: featurise.py プロジェクト: ninjin/eepura
def _featurise_negated(_id, id_to_ann, doc_base, txt_doc):
    """Yield string features for the event annotation stored under `_id`.

    Features are produced lazily, in this order:

    1. the trigger text (spaces replaced by '^');
    2. trigger-text prefixes of 2 to 7 characters;
    3. one group of 'HEURISTIC-...' features for every mark returned by
       `nesp_heuristic` whose target is `_id`;
    4. bag-of-words features for up to three tokens before and up to three
       tokens after the trigger token, within the trigger's sentence.

    Arguments:
    _id -- key of the event annotation in `id_to_ann`
    id_to_ann -- mapping from annotation id to annotation object; the event
        annotation's `trigger` attribute must itself be a key of the mapping
    doc_base -- path prefix; the file `doc_base + '.nesp.st'` is opened and
        parsed with `parse_ann`
    txt_doc -- document object exposing `sentence_by_offset`
    """
    e_ann = id_to_ann[_id]
    trigg_ann = id_to_ann[e_ann.trigger]
    # ^ does not occur in the train and dev sets, so it is relatively safe:
    #    sed -e 's| ||g' -e 's|\(.\)|\1\n|g' \
    #        res/ann/{epi,id,ge}/{train,dev}/*.txt | sed '/^$/d' | sort \
    #        | uniq -c | sort -n -r | grep '\^'
    trigger_text = trigg_ann.comment.replace(' ', '^')

    # Feature: The actual text
    yield 'TRIGGER-TEXT-{}'.format(trigger_text)

    # Feature: Prefixes (2 to 7 characters, never longer than the text).
    # `range` rather than `xrange` so this runs on Python 2 and Python 3.
    for size in range(2, min(8, len(trigger_text) + 1)):
        yield 'TRIGGER-PREFIX-{}-{}'.format(size, trigger_text[:size])

    # XXX: Not efficient! The annotation file is re-read and the heuristic
    #   re-run for every single annotation that is featurised.
    ee_anns = list(id_to_ann.values())
    nesp_path = doc_base + '.nesp.st'
    # Project imports stay function-local, as in the original code.
    from lib.ann import parse_ann
    with open(nesp_path, 'r') as nesp_file:
        # XXX: Remove the ugly hack to protect us from newlines!
        # Blank lines are skipped; the list is built while the file is open.
        nesp_anns = list(parse_ann(l.rstrip('\n')
                for l in nesp_file if l.strip()))
    from lib.heuristic import nesp_heuristic
    for mark in nesp_heuristic(ee_anns, nesp_anns):
        if mark.target != _id:
            continue
        heuristic_base = 'HEURISTIC-{}-{}'.format(mark.type,
                'ROOT' if mark.root else 'NON-ROOT')
        yield heuristic_base
        yield heuristic_base + '-' + mark.cue.comment.replace(' ', '^')
        for span_token in mark.span.comment.split():
            yield heuristic_base + '-' + span_token

    # Event type, actually boosts us a bit, we'll come back to this later
    # XXX: Causes negation to spike with precision, hurts speculation
    #yield 'EVENT-TYPE-{}'.format(e_ann.type)

    # Contextual features
    sentence = txt_doc.sentence_by_offset(trigg_ann.start)

    # BoW for the trigger context
    trigger_token = sentence.token_by_offset(trigg_ann.start)
    trigger_token_index = sentence.tokens.index(trigger_token)

    # Up to three tokens immediately preceding the trigger, in text order
    pre_trigg_toks = sentence.tokens[:trigger_token_index]
    for pre_trigg_tok in pre_trigg_toks[-3:]:
        yield 'PRE-TRIGG-BOW:{}'.format(pre_trigg_tok.text.replace(' ', '^'))

    # XXX: Hurts performance, as expected
    #for sent_tok in sentence.tokens:
    #    yield 'SENT-BOW:{}'.format(sent_tok.text.replace(' ', '^'))

    # Up to three tokens immediately following the trigger, emitted
    # furthest-first (the same order the original reverse-then-slice gave).
    post_trigg_toks = sentence.tokens[trigger_token_index + 1:]
    for post_trigg_tok in reversed(post_trigg_toks[:3]):
        yield 'POST-TRIGG-BOW:{}'.format(post_trigg_tok.text.replace(' ', '^'))
コード例 #2
0
ファイル: heurenrich.py プロジェクト: ninjin/eepura
def main(args):
    """Write one new Modifier line per heuristic mark and return 0.

    `args` is an argv-style sequence; its first element is skipped before
    parsing. The event-extraction and negation/speculation inputs are each
    parsed with `parse_ann`. New modifiers are numbered consecutively,
    starting one above the highest 'M<n>' id already present among the
    event-extraction annotations (or from 1 when there is none).
    """
    argp = ARGPARSER.parse_args(args[1:])

    # Event-extraction annotations; plain strings from the parser are dropped
    ee_lines = (line.rstrip('\n') for line in argp.event_extraction_output)
    ee_anns = [ann for ann in parse_ann(ee_lines)
            if not isinstance(ann, str)]

    nesp_lines = (line.rstrip('\n')
            for line in argp.negation_speculation_output)
    nesp_anns = list(parse_ann(nesp_lines))

    # Highest modifier number already in use, 0 when there are no modifiers
    highest_m_num = 0
    for ann in ee_anns:
        if isinstance(ann, Modifier):
            highest_m_num = max(highest_m_num, int(ann.id[1:]))

    marks = nesp_heuristic(ee_anns, nesp_anns, root_internal=True)
    for m_num, mark in enumerate(marks, highest_m_num + 1):
        modifier = Modifier('M{}'.format(m_num), mark.type, mark.target)
        argp.output.write(unicode(modifier) + '\n')

    return 0