def frequency_in_json(stream):
    """Count how often each leaf occurs across the JSON trees in *stream*.

    The stream is read in one go and split into lines.  Every line is
    decoded with ``json.loads`` and passed to ``leaves``; each item that
    ``leaves`` produces bumps its counter by one.

    Returns a ``defaultdict(int)`` mapping leaf -> number of occurrences.
    """
    counts = defaultdict(int)
    # Decode lazily so each tree is parsed just before its leaves are counted.
    trees = (json.loads(line) for line in stream.read().splitlines())
    for tree in trees:
        for leaf in leaves(tree):
            counts[leaf] += 1
    return counts
#!/Users/husnusensoy/Downloads/pypy-2.0-beta2/bin/pypy
# Filter a file holding one JSON tree per line: every line whose tree has at
# most ``max`` leaves is copied unchanged to standard output.
import json
import sys

from tree2sentence import leaves

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Filter a JSON tree file by length')
    parser.add_argument("file", type=str, help="File to be filtered")
    # The limit is compared against len(leaves(tree)) for each line, so it
    # bounds the size of a single tree, not the size of the file.
    parser.add_argument("max", type=int,
                        help="Maximum number of leaves allowed in a tree")
    args = parser.parse_args()

    with open(args.file, "r") as fp:
        for line in fp:
            if len(leaves(json.loads(line))) <= args.max:
                # Echo the raw line (including its newline) so the output
                # stays byte-for-byte a subset of the input.
                sys.stdout.write(line)
# Bracketing evaluation script: compares the brackets of a key file against
# the brackets of a parser output file.
#
# - Positional arguments: key_file (asserted to end in ".key") and output_file.
# - Both files are read in lockstep with zip(), one JSON tree per line; pairs
#   whose output line, once stripped, equals '#####' are skipped.
# - For each tree, brackets(tree, leaves(tree)) yields the brackets.  Unless
#   --unary is given, only brackets with tuple[1] - tuple[0] > 1 are kept.
#
# NOTE(review): --root is declared with action="store_false", so args.root
#   defaults to True and *passing* --root makes `not args.root` true, which
#   drops brackets whose span equals len(lvs).  That is the opposite of the
#   help text ("Include root bracket ...") -- confirm the intended polarity.
# NOTE(review): the files are opened in "rb"; under Python 3 model.strip()
#   would be bytes and never equal the str '#####'.  Presumably this targets
#   Python 2 -- confirm before porting.
# NOTE(review): the ".key" suffix check is an assert, so it disappears when
#   the interpreter runs with -O.
parser = argparse.ArgumentParser(description='Evaluate the bracketing metric of parser') parser.add_argument('key_file', help='Key file') parser.add_argument('output_file', help='Output file') parser.add_argument("--unary", help="Include unary brackets", action="store_true") parser.add_argument('--root', help="Include root bracket covering full sentence", action="store_false") args = parser.parse_args() assert args.key_file[-4:] == ".key" measure = Measure() with open(args.key_file, "rb") as key_fp, open(args.output_file, "rb") as model_fp: for key, model in zip(key_fp, model_fp): if model.strip() != '#####': key_json = json.loads(key) lvs = leaves(key_json) key_bracket = brackets(key_json, lvs) if not args.unary: key_bracket = filter(lambda tuple: tuple[1] - tuple[0] > 1, key_bracket) if not args.root: key_bracket = filter(lambda tuple: tuple[1] - tuple[0] != len(lvs), key_bracket) key_bracket_set = set(key_bracket) model_json = json.loads(model) lvs = leaves(model_json) model_bracket = brackets(model_json, lvs) if not args.unary:
parser.add_argument('--maxlength', type=int, help="Maximum sentence length on validation set", default=10000)
args = parser.parse_args()

# Validate the inputs explicitly: `assert` statements are stripped when the
# interpreter runs with -O, which would let bad values through silently.
# The type is checked before the range so the comparison is always numeric.
if not isinstance(args.trainratio, float):
    parser.error("train ratio must be a float, got %r" % (args.trainratio,))
if not 0.0 < args.trainratio < 1.0:
    parser.error("train ratio must be strictly between 0 and 1, got %r"
                 % (args.trainratio,))

# Reject an unexpected extension before reading the (possibly large) input.
root, ext = os.path.splitext(args.file)
if ext != ".json":
    import sys  # kept local: a file-level import of sys is not guaranteed
    sys.stderr.write("Unexpected file extension %s\n" % (ext))
    sys.exit(1)

with open(args.file, "r") as fp:
    sentence_lst = fp.readlines()

# The first `ntrain` lines form the training set; the remainder is the
# candidate validation set.
ntrain = int(math.ceil(len(sentence_lst) * args.trainratio))

# All three outputs share the "<root>.l<maxlength>" prefix.
prefix = "%s.l%d" % (root, args.maxlength)
with open(prefix + ".dev", "w") as devp, \
        open(prefix + ".key", "w") as keyp, \
        open(prefix + ".train", "w") as trainp:
    # Training lines are copied through unchanged.
    trainp.writelines(sentence_lst[:ntrain])

    for s in sentence_lst[ntrain:]:
        # Length is always measured with leaves(tree, False), independent of
        # the --tagged setting used for the dev output below.
        if len(leaves(json.loads(s), False)) > args.maxlength:
            continue
        # .key receives the original tree line; .dev receives the leaves
        # serialized as one JSON value per line.
        keyp.write(s)
        # The line is parsed again rather than reusing the tree above, in
        # case leaves() modifies its argument -- TODO confirm and share it.
        json.dump(leaves(json.loads(s), args.tagged), devp)
        devp.write("\n")