コード例 #1
0
    'nlvr2',
    'spotdiff',
    'adobe',
]

# Build a vocabulary file for every dataset named in DATASETS and print a
# short word-frequency report for each one.
ds_root = "../dataset/"
for ds_name in DATASETS:
    print("Processing dataset %s" % ds_name)

    # Pool the train and valid splits into a single list of records.
    dataset = []
    for split_name in ['train', 'valid']:
        split_path = os.path.join(ds_root, ds_name, split_name + ".json")
        # The context manager closes the handle as soon as parsing is done
        # rather than leaving it open until garbage collection.
        with open(split_path) as split_file:
            dataset.extend(json.load(split_file))
        print("Finish Loading split %s" % split_name)
    print("Number of data is %d." % len(dataset))

    # Flatten every record's "sents" list in one pass. Summing lists onto []
    # re-copies the accumulated result for each record (quadratic time).
    sents = [sent for datum in dataset for sent in datum["sents"]]
    print("Number of sents is %d." % len(sents))

    # NOTE(review): min_occur=3 presumably drops words seen fewer than three
    # times -- confirm against the Tokenizer implementation.
    tok = Tokenizer()
    tok.build_vocab(sents, min_occur=3)
    tok.dump(os.path.join(ds_root, ds_name, "vocab.txt"))

    # Rank the (word, count) pairs from tok.occur by count, highest first.
    # sorted() is stable, so ties keep the order tok.occur yields them in.
    wordXnum = sorted(tok.occur.items(), key=lambda x: x[1], reverse=True)
    N = 50
    print("Top %d Words:" % N)
    for word, num in wordXnum[:N]:
        print("%s: %d" % (word, num))
    print()