Exemple #1
0
def frequency_in_json(stream):
    """Count occurrences of every leaf across a stream of JSON lines.

    The stream is read in full and split into lines. Each line is decoded
    with ``json.loads`` and passed to ``leaves``; every item ``leaves``
    produces increments its own counter by one.

    Args:
        stream: Object exposing ``read()`` that returns the complete text,
            one JSON document per line.

    Returns:
        A ``defaultdict(int)`` mapping each leaf to the number of times it
        was seen.
    """
    counts = defaultdict(int)
    lines = stream.read().splitlines()

    for line in lines:
        tree = json.loads(line)
        for leaf in leaves(tree):
            counts[leaf] += 1

    return counts
Exemple #2
0
#!/Users/husnusensoy/Downloads/pypy-2.0-beta2/bin/pypy

import json
import sys
from tree2sentence import leaves

if __name__ == "__main__":
    import argparse

    # Command-line filter: echo to stdout only those lines of the input file
    # whose decoded JSON tree has at most `max` leaves.
    parser = argparse.ArgumentParser(description='Filter a JSON tree file by length')
    parser.add_argument("file", type=str, help="File to be filtered")
    # The limit is applied per line (per tree), not to the file as a whole.
    parser.add_argument("max", type=int, help="Maximum number of leaves per tree")

    args = parser.parse_args()

    with open(args.file, "r") as fp:
        # Iterate the file directly instead of going through filter(): under
        # Python 2 filter() builds a list of every kept line before anything
        # is written, whereas this loop holds one line at a time.
        for line in fp:
            if len(leaves(json.loads(line))) <= args.max:
                # Lines keep their own trailing newline, so write them as-is.
                sys.stdout.write(line)
Exemple #3
0
    # Command-line interface: two positional file paths plus two boolean switches.
    parser = argparse.ArgumentParser(description='Evaluate the bracketing metric of parser')
    parser.add_argument('key_file', help='Key file')
    parser.add_argument('output_file', help='Output file')
    # store_true: args.unary is False unless --unary is given.
    parser.add_argument("--unary", help="Include unary brackets", action="store_true")
    # NOTE(review): store_false makes args.root default to True and become False
    # when --root is passed. The code further down drops full-sentence brackets
    # when `not args.root`, so passing --root *excludes* the root bracket, the
    # opposite of what the help text says. Confirm the intended polarity.
    parser.add_argument('--root', help="Include root bracket covering full sentence", action="store_false")

    args = parser.parse_args()

    # The key file name must end in ".key".
    # NOTE(review): assert is skipped under `python -O`, so this check is not
    # enforced in optimized runs.
    assert args.key_file[-4:] == ".key"

    measure = Measure()
    with open(args.key_file, "rb") as key_fp, open(args.output_file, "rb") as model_fp:
        for key, model in zip(key_fp, model_fp):
            if model.strip() != '#####':
                key_json = json.loads(key)
                lvs = leaves(key_json)
                key_bracket = brackets(key_json, lvs)

                if not args.unary:
                    key_bracket = filter(lambda tuple: tuple[1] - tuple[0] > 1, key_bracket)

                if not args.root:
                    key_bracket = filter(lambda tuple: tuple[1] - tuple[0] != len(lvs), key_bracket)

                key_bracket_set = set(key_bracket)

                model_json = json.loads(model)
                lvs = leaves(model_json)
                model_bracket = brackets(model_json, lvs)

                if not args.unary:
Exemple #4
0
parser.add_argument('--maxlength', type=int, help="Maximum sentence length on validation set", default=10000)

args = parser.parse_args()

# Validate explicitly instead of with `assert`: assertions are stripped under
# `python -O`, and the type is checked before the range comparison relies on it.
if not isinstance(args.trainratio, float) or not 0.0 < args.trainratio < 1.0:
    parser.error("trainratio must be a float strictly between 0 and 1")

# Reject an unexpected extension before reading the whole input file.
root, ext = os.path.splitext(args.file)

if ext != ".json":
    import sys

    sys.stderr.write("Unexpected file extension %s\n" % (ext))
    sys.exit(1)

with open(args.file, "r") as fp:
    sentence_lst = fp.readlines()

# The first ceil(N * trainratio) lines form the training split; the rest is
# the held-out portion.
ntrain = int(math.ceil(len(sentence_lst) * args.trainratio))

# All three outputs share the "<root>.l<maxlength>" prefix.
prefix = "%s.l%d" % (root, args.maxlength)

with open(prefix + ".dev", "w") as devp, \
        open(prefix + ".key", "w") as keyp, \
        open(prefix + ".train", "w") as trainp:
    # Training lines are copied through verbatim.
    trainp.writelines(sentence_lst[:ntrain])

    for s in sentence_lst[ntrain:]:
        # Decode each held-out line once and reuse the tree for both the
        # length test and the dev output.
        # NOTE(review): assumes leaves() does not modify the tree it is given.
        tree = json.loads(s)

        # Keep only sentences whose leaf count is within the limit.
        if len(leaves(tree, False)) > args.maxlength:
            continue

        # .key gets the original JSON line; .dev gets the leaves as one JSON
        # document per line.
        keyp.write(s)

        json.dump(leaves(tree, args.tagged), devp)
        devp.write("\n")