def validate(inp, out, args, tag_sets):
    """Run the whole-tree validation tests on every tree read from ``inp``.

    Trees are produced by ``trees(inp, tag_sets, args)``; the individual
    lines have already been validated there, so only the tests that need a
    complete tree are run here. When ``args.echo_input`` is set, each tree
    is printed to ``out`` together with its comments.
    """
    # Tests done on the whole tree, listed in the order they are executed.
    whole_tree_tests = (
        validate_ID_sequence,
        validate_ID_references,
        validate_token_ranges,
        validate_root,
        validate_deps,
        validate_tree,
    )
    for sent_comments, sent_tree in trees(inp, tag_sets, args):
        for run_test in whole_tree_tests:
            run_test(sent_tree)
        if args.echo_input:
            file_util.print_tree(sent_comments, sent_tree, out)
    # NOTE(review): the newline check takes the input stream, not a tree, so
    # it is run once after all trees are consumed -- confirm against the
    # original layout.
    validate_newlines(inp)
def validate(inp, out, args, tag_sets):
    """Validate every tree of ``inp`` with the tree-level tests.

    ``trees(inp, tag_sets, args)`` yields ``(comments, tree)`` pairs whose
    individual lines have been validated already; this function only applies
    the tests that look at a tree as a whole. Each tree is echoed to ``out``
    when ``args.echo_input`` is set.
    """
    # Whole-tree tests, kept in their execution order.
    tree_level_tests = (
        validate_ID_sequence,
        validate_ID_references,
        validate_token_ranges,
        validate_root,
        validate_deps,
        validate_tree,
    )
    for tree_comments, current_tree in trees(inp, tag_sets, args):
        for tree_test in tree_level_tests:
            tree_test(current_tree)
        if not args.echo_input:
            continue
        file_util.print_tree(tree_comments, current_tree, out)
    # NOTE(review): run once on the input stream after the last tree; the
    # nesting is assumed from the call's argument -- confirm.
    validate_newlines(inp)
def validate(inp, out, args, tag_sets, known_sent_ids):
    """Run the whole-tree validation tests on every tree read from ``inp``.

    Trees are produced by ``trees(inp, tag_sets, args)``; the individual
    lines have already been validated there, so only the tests that need a
    complete tree are run here. The module-level ``tree_counter`` is
    incremented once per tree before its tests run.

    After the structural tests, the tree's comments are passed to
    ``validate_sent_id`` along with ``known_sent_ids`` and ``args.lang``.
    ``validate_text_meta`` is run only when ``args.check_tree_text`` is set,
    and the tree is printed to ``out`` only when ``args.echo_input`` is set.
    """
    global tree_counter
    # Tests done on the whole tree, listed in the order they are executed.
    whole_tree_tests = (
        validate_ID_sequence,
        validate_ID_references,
        validate_token_ranges,
        validate_root,
        validate_deps,
        validate_tree,
    )
    for sent_comments, sent_tree in trees(inp, tag_sets, args):
        tree_counter = tree_counter + 1
        for run_test in whole_tree_tests:
            run_test(sent_tree)
        validate_sent_id(sent_comments, known_sent_ids, args.lang)
        if args.check_tree_text:
            validate_text_meta(sent_comments, sent_tree)
        if args.echo_input:
            file_util.print_tree(sent_comments, sent_tree, out)
    # NOTE(review): the newline check takes the input stream, not a tree, so
    # it is run once after all trees are consumed -- confirm against the
    # original layout.
    validate_newlines(inp)
def validate(inp, out, args, tag_sets, known_sent_ids):
    """Validate every tree of ``inp`` with the tree-level tests.

    ``trees(inp, tag_sets, args)`` yields ``(comments, tree)`` pairs whose
    individual lines have been validated already; this function only applies
    the tests that look at a tree as a whole, bumping the module-level
    ``tree_counter`` for each tree it receives.

    The comments of each tree are then checked by ``validate_sent_id``
    (given ``known_sent_ids`` and ``args.lang``). ``args.check_tree_text``
    enables ``validate_text_meta``; ``args.echo_input`` enables printing the
    tree to ``out``.
    """
    global tree_counter
    # Whole-tree tests, kept in their execution order.
    tree_level_tests = (
        validate_ID_sequence,
        validate_ID_references,
        validate_token_ranges,
        validate_root,
        validate_deps,
        validate_tree,
    )
    for tree_comments, current_tree in trees(inp, tag_sets, args):
        tree_counter = tree_counter + 1
        for tree_test in tree_level_tests:
            tree_test(current_tree)
        validate_sent_id(tree_comments, known_sent_ids, args.lang)
        if args.check_tree_text:
            validate_text_meta(tree_comments, current_tree)
        if args.echo_input:
            file_util.print_tree(tree_comments, current_tree, out)
    # NOTE(review): run once on the input stream after the last tree; the
    # nesting is assumed from the call's argument -- confirm.
    validate_newlines(inp)
line_idx+=1 for word_idx,_ in enumerate(range(b,e+1)): #consume as many lines as there are words in the token word_ids.append("%d.%d"%(token_idx+1,word_idx+1)) wtree[line_idx][ID]=word_ids[-1] line_idx+=1 #word_ids is now a list with 1-based indexing which has the new ID for every single word #the ID column has been renumbered by now #now we can renumber all of the HEAD columns for cols in wtree: if cols[HEAD]==u"_": #token continue cols[HEAD]=word_ids[int(cols[HEAD])] if cols[DEPS]!=u"_": #need to renumber secondary deps new_pairs=[] for head_deprel in cols[DEPS].split(u"|"): head,deprel=head_deprel.split(u":") new_pairs.append(word_ids[int(head)]+u":"+deprel) cols[DEPS]=u"|".join(new_pairs) if __name__=="__main__": opt_parser = argparse.ArgumentParser(description='Conversion script from word-based CoNLL-U to token-based CoNLL-U. This script assumes that the input is validated and does no checking on its own.') opt_parser.add_argument('input', nargs='?', help='Input file name, or "-" or nothing for standard input.') opt_parser.add_argument('output', nargs='?', help='Output file name, or "-" or nothing for standard output.') args = opt_parser.parse_args() #Parsed command-line arguments inp,out=file_util.in_out(args) for comments,tree in file_util.trees(inp): w2t(tree) file_util.print_tree(comments,tree,out)