Example #1
import re
import json

# Offset is assumed to come from the same local offset module used in the
# second example; Tree, SpanTree, tree_to_string, add_spaces, split_offsets,
# is_trace and span_from_treepos are likewise assumed to be local/NLTK helpers.
from offset import Offset

def process_file(json_filename, nb):
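    """Attach the argument records in nb for this sentence to the JSON file.

    PTB leaf positions are remapped to the OntoNotes (ONF) tokenization,
    which contains inserted traces; each record's args are rewritten as
    [role, pos, start, end, phrase] and the records are appended to
    data['nom'] before the file is written back in place.
    """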
    docId, sentNr = re.search(r'wsj_(\d+)\.(\d+)\.json', json_filename).groups()
    sentNr = int(sentNr)
    data = json.load(open(json_filename))
    data['nom'] = []

    # index adjustments for consistency with the OntoNotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree) # wrap traces

    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree) # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)

    ptbstring_tok = add_spaces(ptbstring, onfstring)

    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(), ignore_braces=True)
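    # tokenize_offsets maps each original PTB token index to its position(s)
    # in the re-tokenized string; trace_offsets maps those positions on into
    # the ONF token sequence, which additionally contains trace tokens.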

    #print ptbstring
    #print ptbstring_tok
    #print onfstring
    #print tokenize_offsets
    #print trace_offsets

    pt = SpanTree.parse(data['ptbparse'])

    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']

        # TODO: arguments that are chains or concatenations of multiple nodes

        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id is not None and depth is not None:
                treepos = pt.get_treepos(leaf_id, depth)
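                # if the span lands on a trace node, follow the trace id in
                # its leaf (the numeric suffix) to the node it points to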
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print 'looking for trace', trace_id
                    tracepos = pt.find_trace(trace_id)
                    if tracepos is not None:
                        print 'trace %s found here:' % trace_id, tracepos
                        print pt[tracepos].pprint()
                        treepos = tracepos
                    else:
                        break # could not follow trace

                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                #print start, end,

                # adjust for the different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])

                # adjust for the traces inserted in the OntoNotes parse
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
                #print '->', start, end

            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end+1])
            new_args.append([role, pos, start, end, phrase])

        nb_data['args'] = new_args
        data['nom'].append(nb_data)

        #print nb_data
    json.dump(data, open(json_filename, 'w'), indent=2, sort_keys=True)
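
# Illustrative shape of the nb argument (hypothetical ids, positions and
# roles; only the structure is implied by the code above): a mapping from
# document id and sentence number to a list of records whose 'args' pair a
# parse position with a role label.
#
# nb = {
#     '2300': {                      # docId taken from wsj_2300.<n>.json
#         0: [                       # sentence number
#             {'args': [('3:1', 'ARG0'), ('5:0', 'ARG1')]},
#         ],
#     },
# }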
Example #2
import sys
import json
from offset import Offset
from brackets import escape_brackets, unescape_brackets

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('json', action='store', help="json input file")
    parser.add_argument('jsonout', action='store', help="json output file")
    arguments = parser.parse_args(sys.argv[1:])

    data = json.load(open(arguments.json))

    cleanstring = escape_brackets(data['text']).split()
    tracestring = escape_brackets(data['treebank_sentence']).split()
    off = Offset(cleanstring, tracestring)
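    # off maps positions in the clean sentence onto the treebank sentence,
    # which additionally contains trace tokens; brackets are escaped on both
    # sides so the two token sequences can be compared directly.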

    words = data['words']
    #print 'c:', cleanstring
    #print 't:', tracestring
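    # each entry of words is a [token, attribute-dict] pair; record the
    # token's position in the trace-bearing treebank tokenization as 'idx'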
    for i, w in enumerate(words):
        assert len(w) == 2
        lemma = escape_brackets(w[0])
        adjusted_idx = off.map_to_longer(i)
        assert lemma == cleanstring[i]
        assert lemma == tracestring[adjusted_idx]
        w[1]['idx'] = adjusted_idx

    json.dump(data, open(arguments.jsonout,'w'), indent=2, sort_keys=True)
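
# For reference, a minimal sketch of the index-mapping idea behind
# Offset.map_to_longer (an illustration of the assumed behaviour, not the
# project's offset module): walk both token lists in parallel and skip
# tokens that occur only in the longer, trace-bearing list.

def map_to_longer_sketch(short_toks, long_toks, i):
    j = 0
    for k, tok in enumerate(short_toks):
        # advance past tokens (e.g. traces) present only in the long list
        while j < len(long_toks) and long_toks[j] != tok:
            j += 1
        if k == i:
            return j
        j += 1
    raise IndexError(i)

# e.g. map_to_longer_sketch(['the', 'dog'], ['the', '*T*-1', 'dog'], 1) -> 2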