if opts.onlyfold is not None and i != opts.onlyfold: continue left = i*foldlen right = min(totalnum, (i+1)*foldlen) print >> logs, i, "\t [%d, %d) \t%d lines" % (left, right, right-left) os.system("mkdir " + thisdir[i]) infold_input = open(thisdir[i] + "/toparse.ecinput", "wt") outfold_gold = open(thisdir[i] + "/totrain.cleangold", "wt") infold_gold = open(thisdir[i] + "/toparse.cleangold", "wt") for j, line in enumerate(goldtrees[left : right]): start = "<s small.%d.%d>" % (i, j+1) print >> infold_input, start, " ".join(words_from_line(line)), "</s>" print >> infold_gold, "".join(goldtrees[left : right]), print >> outfold_gold, "".join(goldtrees[:left] + goldtrees[right:]), ####### TRAINING ***************** print >> logs, "**************************** training folds ************************" if opts.fromstep <= 1: traindir = os.environ["HOME"] + "/rerank/first-stage/TRAIN" trainscript = traindir + "/allScript" for i in xrange(opts.numfolds):
#!/usr/bin/env python
import sys
from utility import num_words, words_from_line

## cat trees.txt | filter_by_length.py [-w] [<max_len>]
##
## Reads one line per record from stdin and echoes only those whose word
## count (as computed by utility.words_from_line) is at most <max_len>.
## With -w, the space-joined words are printed instead of the original line.

# Length limit used when <max_len> is missing or is not an integer.
DEFAULT_MAX_LEN = 400


def _parse_args(argv):
    """Parse the command line into ``(print_words, max_len)``.

    argv is the full argument vector, with the script name at index 0.
    Both the ``-w`` flag and the ``<max_len>`` value are optional; a
    missing or non-integer ``<max_len>`` falls back to DEFAULT_MAX_LEN.
    The list passed in is not modified.
    """
    args = list(argv[1:])

    # Guard against an empty argument list: the flag is optional, so
    # running the script with no arguments at all must not fail.
    print_words = bool(args) and args[0] == "-w"
    if print_words:
        del args[0]

    try:
        max_len = int(args[0])
    except (IndexError, ValueError):
        # No length given, or it was not a number: use the default.
        max_len = DEFAULT_MAX_LEN
    return print_words, max_len


if __name__ == "__main__":
    print_words, max_len = _parse_args(sys.argv)
    for line in sys.stdin:
        words = words_from_line(line)
        if len(words) <= max_len:
            kept = " ".join(words) if print_words else line.strip()
            # sys.stdout.write keeps the output identical to the print
            # statement while staying valid on both Python 2 and Python 3.
            sys.stdout.write(kept + "\n")