Example #1
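# needs the standard-library imports below; Tree, SpanTree, tree_to_string,
# add_spaces, split_offsets, Offset, is_trace and span_from_treepos are
# project-local helpers that are not shown on this page
import json
import re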
def process_file(json_filename, nb):
    # e.g. wsj_0204.12.json -> docId '0204', sentence 12
    docId, sentNr = re.search(r'wsj_(\d+)\.(\d+)\.json', json_filename).groups()
    sentNr = int(sentNr)
    data = json.load(open(json_filename))
    data['nom'] = []

    # index adjustments for consistency with ontonotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree) # wrap traces

    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree) # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)

    ptbstring_tok = add_spaces(ptbstring, onfstring)

    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(), ignore_braces=True)
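    # (assumption) tokenize_offsets maps each original PTB token index to the
    # indices it splits into, and trace_offsets maps those positions onto the
    # OntoNotes tokenization, which contains inserted trace tokens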

    #print ptbstring
    #print ptbstring_tok
    #print onfstring
    #print tokenize_offsets
    #print trace_offsets

    pt = SpanTree.parse(data['ptbparse'])

    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']

        # TODO: arguments that are chains or concatenations of multiple nodes

        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id is not None and depth is not None:
                treepos = pt.get_treepos(leaf_id, depth)
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print('looking for trace', trace_id)
                    tracepos = pt.find_trace(trace_id)
                    if tracepos is not None:
                        print('trace %d found! Here:' % trace_id, tracepos)
                        print(pt[tracepos].pprint())
                        treepos = tracepos
                    else:
                        break # could not follow trace

                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                #print start, end,

                # adjust for the different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])

                # adjust for traces inserted in the OntoNotes parse
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
                #print '->', start, end

            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end+1])
            new_args.append( [role, pos, start, end, phrase] )

        nb_data['args'] = new_args
        data['nom'].append(nb_data)

        #print nb_data
    with open(json_filename, 'w') as out:
        json.dump(data, out, indent=2, sort_keys=True)
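The while loop above follows Penn Treebank traces: an argument node may dominate only an empty element such as *T*-1, whose numeric index points to a coindexed antecedent elsewhere in the parse. SpanTree.find_trace is not shown on this page; the sketch below is a rough, self-contained illustration of the idea using a plain nltk.Tree, and all helper names in it are hypothetical.

from nltk.tree import Tree

def looks_like_trace(node):
    # a trace node dominates a single indexed empty element such as '*T*-1' or '*-2'
    return (isinstance(node, Tree) and len(node.leaves()) == 1
            and node.leaves()[0].startswith('*')
            and node.leaves()[0].split('-')[-1].isdigit())

def find_coindexed_node(tree, trace_id):
    # the antecedent carries the same numeric index on its label, e.g. WHNP-1
    for pos in tree.treepositions():
        node = tree[pos]
        if isinstance(node, Tree) and node.label().split('-')[-1] == trace_id:
            return pos
    return None

def follow_trace(tree, pos):
    # mirror the while loop above: keep jumping until the node is not a trace
    while looks_like_trace(tree[pos]):
        trace_id = tree[pos].leaves()[0].split('-')[-1]
        antecedent = find_coindexed_node(tree, trace_id)
        if antecedent is None:
            break  # could not follow trace
        pos = antecedent
    return pos

t = Tree.fromstring('(SBAR (WHNP-1 (WDT which)) (S (NP (-NONE- *T*-1)) (VP (VBD fell))))')
print(follow_trace(t, (1, 0)))   # -> (0,), the position of (WHNP-1 (WDT which))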
Example #2
def span_from_treepos(tree, treepos):
    st = SpanTree.parse(str(tree))
    st.convert()
    start = min(st[treepos].leaves())
    end = max(st[treepos].leaves())
    return (start, end)
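span_from_treepos returns the first and last token index covered by the subtree at treepos. SpanTree.convert is not shown here; judging from this usage, it replaces every leaf with its token index, so min/max over the subtree's leaves yields the span. A rough, self-contained approximation with a plain nltk.Tree, for illustration only (the function name below is hypothetical):

from nltk.tree import Tree

def span_from_treepos_sketch(tree, treepos):
    # number the leaves left to right, then take min/max inside the subtree
    indexed = Tree.fromstring(str(tree))
    for i, leafpos in enumerate(indexed.treepositions('leaves')):
        indexed[leafpos] = i
    leaves = indexed[treepos].leaves()
    return (min(leaves), max(leaves))

t = Tree.fromstring('(S (NP (DT the) (NN cat)) (VP (VBD sat)))')
print(span_from_treepos_sketch(t, (0,)))   # (0, 1) -> "the cat"
print(span_from_treepos_sketch(t, (1,)))   # (2, 2) -> "sat"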
Example #3
    pb = None
    if os.path.isfile(arguments.probbank):
        pb = read_onprop(arguments.probbank)
    else:
        prop_bank_prob = arguments.probbank.replace(".prop",".pprop")
        assert os.path.isfile(prop_bank_prob)
        pb = read_onprop(prop_bank_prob)
    assert pb is not None

    docId, sentNr = re.search(r'wsj_(\d+)\.(\d+)\.json', arguments.json).groups()
    sentNr = int(sentNr)
    data = json.load(open(arguments.json))
    data['prop'] = []

    pt = SpanTree.parse(data['goldparse'])

    for propS in pb[docId][sentNr]:
        prop = parse_onprop(propS)
        args = prop['args']

        # TODO: concatenated arguments (comma-separated positions are used when the argument is not a constituent)
        # currently these appear in the output with null start and end positions

        support2main = {}
        # for LINK-PCR and LINK-SLC arguments, there is a relativizer or empty element which
        # I am calling a "support" node; this "support" is associated with a normal argument
        # and the link associates it with the main node. In the output, supporting nodes are