def process_file(json_filename, nb):
    """Add NomBank annotations ('nom') to the JSON sentence file *json_filename*.

    The file name encodes the document id and sentence number
    (wsj_DDDD.N.json); *nb* maps docId -> sentNr -> list of NomBank
    records.  Argument spans given as PTB tree positions are resolved to
    token offsets in the OntoNotes gold parse (following trace chains),
    and the file is rewritten in place with the enriched data.
    """
    # Dots escaped: an unescaped '.' would match any character in the name.
    docId, sentNr = re.search(r'wsj_(\d+)\.(\d+)\.json', json_filename).groups()
    sentNr = int(sentNr)
    with open(json_filename) as f:
        data = json.load(f)
    data['nom'] = []
    # index adjustments for consistency with ontonotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree)    # wrap traces
    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree)     # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)
    ptbstring_tok = add_spaces(ptbstring, onfstring)
    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(),
                           ignore_braces=True)
    pt = SpanTree.parse(data['ptbparse'])
    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']
        # TODO: arguments that are chains or concatenations of multiple nodes
        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id is not None and depth is not None:
                treepos = pt.get_treepos(leaf_id, depth)
                # Follow trace chains (e.g. *T*-1) to the overt antecedent.
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print('looking for trace %d' % trace_id)
                    tracepos = pt.find_trace(trace_id)
                    if tracepos is not None:
                        # Apply the format: the original printed a literal '%s'.
                        print('trace %d found! Here: %s' % (trace_id, tracepos))
                        print(pt[tracepos].pprint())
                        treepos = tracepos
                    else:
                        break  # could not follow trace
                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                # adjust for different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])
                # adjust for inserted traces in ontonotes
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end + 1])
            new_args.append([role, pos, start, end, phrase])
        nb_data['args'] = new_args
        data['nom'].append(nb_data)
    with open(json_filename, 'w') as f:
        json.dump(data, f, indent=2, sort_keys=True)
def span_from_treepos(tree, treepos):
    """Return the (start, end) leaf-index span of the subtree at *treepos*.

    The tree is re-parsed from its string form so that convert() can
    renumber the leaves without mutating the caller's tree.
    """
    indexed = SpanTree.parse(str(tree))
    indexed.convert()
    leaf_ids = indexed[treepos].leaves()
    return (min(leaf_ids), max(leaf_ids))
pb = None if os.path.isfile(arguments.probbank): pb = read_onprop(arguments.probbank) else: prop_bank_prob = arguments.probbank.replace(".prop",".pprop") assert os.path.isfile(prop_bank_prob) pb = read_onprop(prop_bank_prob) assert pb != None docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', arguments.json).groups() sentNr = int(sentNr) data = json.load(open(arguments.json)) data['prop'] = [] pt = SpanTree.parse(data['goldparse']) for propS in pb[docId][sentNr]: prop = parse_onprop(propS) args = prop['args'] # TODO: concatenated arguments (comma-separated positions used if the argument is not a constituent) # currently these appear in the output with null start and end positions support2main = {} # for LINK-PCR and LINK-SLC arguments, there is a relativizer or empty element which # I am calling a "support" node; this "support" is associated with a normal argument # and the link associates it with the main node. In the output, supporting nodes are