opt_parser.add_argument(
    '--catvals',
    default=None,
    help='Print category=value pairs. The option can be "UD", "langspec", or "UD+langspec". This distinction is based on the feature, not the value.'
)
opt_parser.add_argument(
    '--sort',
    default='freq',
    help='Sort the values by their frequency (freq) or alphabetically (alph). Default: %(default)s.'
)
args = opt_parser.parse_args()  # Parsed command-line arguments

# The output target is forced to "-" before the streams are opened.
# NOTE(review): presumably "-" makes file_util.in_out() pick standard
# output -- confirm against file_util.
args.output = "-"
inp, out = file_util.in_out(args, multiple_files=True)
trees = file_util.trees(inp)

# Accumulate counts over every column row of every tree.
stats = Stats()
try:
    for comments, tree in trees:
        stats.tree_count += 1
        for cols in tree:
            stats.count_cols(cols)
except Exception:
    # Best effort: report the failure and fall through so that whatever was
    # counted so far can still be printed. Only Exception is caught, so that
    # KeyboardInterrupt / SystemExit still terminate the run.
    traceback.print_exc()
    print >> sys.stderr, "\n\n ------- STATS MAY BE EMPTY OR INCOMPLETE ----------"

if args.stats:
    stats.print_basic_stats(out)
if args.jsonstats:
    d = stats.get_stats()
def sent_set(inp):
    """Count how many times each sentence text occurs in ``inp``.

    Every item produced by ``file_util.trees(inp)`` is unpacked as a
    ``(comments, rows)`` pair. The sentence text is the FORM column of the
    rows whose ID column consists only of digits, joined with single spaces.

    Returns a dict mapping sentence text to its number of occurrences.
    """
    counts = {}  # key: sentence text, value: number of occurrences
    for _comments, rows in file_util.trees(inp):
        # Rows with a non-numeric ID do not contribute to the text.
        words = [row[FORM] for row in rows if row[ID].isdigit()]
        text = u" ".join(words)
        if text in counts:
            counts[text] += 1
        else:
            counts[text] = 1
    return counts
if cols[DEPS] != u"_":  # need to renumber secondary deps
    new_pairs = []
    for head_deprel in cols[DEPS].split(u"|"):
        # Split on the first colon only: the relation label may itself
        # contain a colon, which would otherwise break the unpacking.
        head, deprel = head_deprel.split(u":", 1)
        new_pairs.append(word_ids[int(head)] + u":" + deprel)
    cols[DEPS] = u"|".join(new_pairs)


if __name__ == "__main__":
    # Command-line entry point: read trees from the input and print every
    # dependency as "deprel(govForm-govIndex, depForm-depIndex)", one per
    # line, with an empty line after each tree.
    opt_parser = argparse.ArgumentParser(description='Conversion script from word-based CoNLL-U to other formats.')
    opt_parser.add_argument('input', nargs='?', help='Input file name, or "-" or nothing for standard input.')
    opt_parser.add_argument('output', nargs='?', help='Output file name, or "-" or nothing for standard output.')
    opt_parser.add_argument('-f', '--output-format', default="dgraph", help='Output format. Currently supported: dgraph (CoreNLP dep output). Default: %(default)s.')
    args = opt_parser.parse_args()  # Parsed command-line arguments

    inp, out = file_util.in_out(args)
    for comments, tree in file_util.trees(inp):
        # A set of (gov, dep, deprel) where gov and dep are zero-based indices
        deps = set()
        for line in tree:
            if not line[ID].isdigit():  # token line, skip
                continue
            if line[HEAD] not in (u"_", u"0"):
                deps.add((int(line[HEAD]) - 1, int(line[ID]) - 1, line[DEPREL]))
            # Process also the DEPS field
            if line[DEPS] != u"_":
                for head_col_deprel in line[DEPS].split(u"|"):
                    head, deprel = head_col_deprel.split(u":", 1)
                    if head == u"0":
                        # Same treatment as a HEAD of 0 above: there is no
                        # governor row, and index -1 would silently pick the
                        # last row of the tree.
                        continue
                    # Use the relation parsed from this DEPS entry, not the
                    # basic DEPREL column of the line.
                    deps.add((int(head) - 1, int(line[ID]) - 1, deprel))
        # Done. Dependencies are emitted in sorted (gov, dep, deprel) order;
        # a tree without dependencies produces only the empty line.
        # NOTE(review): tree[gov] / tree[dep] assume row i of the tree is the
        # word with ID i+1, i.e. word-based input without extra rows -- confirm.
        for gov, dep, deprel in sorted(deps):
            print >> out, u"%s(%s-%d, %s-%d)" % (deprel, tree[gov][FORM], gov + 1, tree[dep][FORM], dep + 1)
        print >> out
help='Input file name, or "-" or nothing for standard input.') opt_parser.add_argument( 'output', nargs='?', help='Output file name, or "-" or nothing for standard output.') opt_parser.add_argument( '-f', '--output-format', default="dgraph", help= 'Output format. Currently supported: dgraph (CoreNLP dep output). Default: %(default)s.' ) args = opt_parser.parse_args() #Parsed command-line arguments inp, out = file_util.in_out(args) for comments, tree in file_util.trees(inp): deps = set( ) #A set of (gov,dep,dType) where gov and dep are zero-based indices for line in tree: if not line[ID].isdigit(): #token line, skip continue if line[HEAD] not in (u"_", u"0"): deps.add( (int(line[HEAD]) - 1, int(line[ID]) - 1, line[DEPREL])) #Process also the DEPS field if line[DEPS] != u"_": for head_col_deprel in line[DEPS].split(u"|"): head, deprel = head_col_deprel.split(u":", 1) deps.add((int(head) - 1, int(line[ID]) - 1, line[DEPREL])) #Done. Maybe these should be sorted somehow? Also, what to do if we have no deps? for gov, dep, deprel in sorted(deps):
print >> out, cat_is_val


if __name__ == "__main__":
    # Command-line entry point: count statistics over all input trees and
    # print the reports selected by the options.
    opt_parser = argparse.ArgumentParser(description='Script for basic stats generation. Assumes a validated input.')
    opt_parser.add_argument('input', nargs='+', help='Input file name (can be several files), or "-" or nothing for standard input.')
    opt_parser.add_argument('--stats', action='store_true', default=False, help='Print basic stats')
    opt_parser.add_argument('--jsonstats', action='store_true', default=False, help='Print basic stats as json dictionary')
    opt_parser.add_argument('--deprels', default=None, help='Print deprels. The option can be "UD", "langspec", or "UD+langspec".')
    opt_parser.add_argument('--catvals', default=None, help='Print category=value pairs. The option can be "UD", "langspec", or "UD+langspec". This distinction is based on the feature, not the value.')
    opt_parser.add_argument('--sort', default='freq', help='Sort the values by their frequency (freq) or alphabetically (alph). Default: %(default)s.')
    args = opt_parser.parse_args()  # Parsed command-line arguments

    # The parser defines no output argument; the output target is set to "-"
    # by hand before the streams are opened.
    # NOTE(review): presumably "-" makes file_util.in_out() pick standard
    # output -- confirm against file_util.
    args.output = "-"
    inp, out = file_util.in_out(args, multiple_files=True)
    trees = file_util.trees(inp)

    # Accumulate counts over every column row of every tree.
    stats = Stats()
    try:
        for comments, tree in trees:
            stats.tree_count += 1
            for cols in tree:
                stats.count_cols(cols)
    except Exception:
        # Best effort: report the failure and fall through so that whatever
        # was counted so far can still be printed. Only Exception is caught,
        # so that KeyboardInterrupt / SystemExit still terminate the run.
        traceback.print_exc()
        print >> sys.stderr, "\n\n ------- STATS MAY BE EMPTY OR INCOMPLETE ----------"

    if args.stats:
        stats.print_basic_stats(out)
    if args.jsonstats:
        d = stats.get_stats()
def sent_set(inp):
    """Count how many times each sentence text occurs in ``inp``.

    Every item produced by ``file_util.trees(inp)`` is unpacked as a
    ``(comments, rows)`` pair. The sentence text is the FORM column of the
    rows whose ID column consists only of digits, joined with single spaces.

    Returns a dict mapping sentence text to its number of occurrences.
    """
    counts = {}  # key: sentence text, value: number of occurrences
    for _comments, rows in file_util.trees(inp):
        # Rows with a non-numeric ID do not contribute to the text.
        words = [row[FORM] for row in rows if row[ID].isdigit()]
        text = u" ".join(words)
        if text in counts:
            counts[text] += 1
        else:
            counts[text] = 1
    return counts