def main():
    args = parse_args()

    # input files
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/dev.jsonl'
    # test_file = args.data_dir + '/test.jsonl'
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    # test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens = [[t.lower() for t in tokens] for tokens in
                                    (train_tokens, dev_tokens)]
        # (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    # all_tokens = train_tokens + dev_tokens + test_tokens
    all_tokens = train_tokens + dev_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    # datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    datasets = {'train': train_tokens, 'dev': dev_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
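# -----------------------------------------------------------------------------
# NOTE: load_tokens, build_vocab, count_oov and random_embedding are called in
# main() above but are not defined in this snippet. The block below is a hedged,
# minimal sketch of what they are assumed to do, not the repo's actual
# implementations; in particular the special-token prefix and the .jsonl field
# name ('tokens') are assumptions.
# -----------------------------------------------------------------------------
import json
from collections import Counter

import numpy as np


def load_tokens(filename):
    # Assumption: each line of the file is a JSON object with a 'tokens' list.
    tokens = []
    with open(filename) as infile:
        for line in infile:
            tokens += json.loads(line)['tokens']
    print("{} tokens loaded from {}.".format(len(tokens), filename))
    return tokens


def build_vocab(tokens, glove_vocab, min_freq):
    # Keep tokens above min_freq (or, when min_freq <= 0, those covered by GloVe),
    # most frequent first, behind the usual special tokens.
    counter = Counter(tokens)
    if min_freq > 0:
        kept = sorted([t for t in counter if counter[t] >= min_freq],
                      key=counter.get, reverse=True)
    else:
        kept = sorted([t for t in counter if t in glove_vocab],
                      key=counter.get, reverse=True)
    return ['<PAD>', '<UNK>'] + kept  # the repo likely uses constant.VOCAB_PREFIX here


def count_oov(tokens, vocab_list):
    # Return (total token count, number of tokens missing from the vocabulary).
    known = set(vocab_list)
    oov = sum(1 for t in tokens if t not in known)
    return len(tokens), oov


def random_embedding(vocab_list, wv_dim):
    # Uniform random initialization; the range/scheme may differ in the actual repo.
    return np.random.uniform(-1, 1, (len(vocab_list), wv_dim))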
def prepare_vocab(data_dir, vocab_dir, spacy_model, glove_dir="dataset/glove",
                  wv_file="glove.840B.300d.txt", wv_dim=300, min_freq=0, lower=True):
    # input files
    train_file = data_dir + '/train.json'
    dev_file = data_dir + '/dev.json'
    test_file = data_dir + '/test.json'
    wv_file = glove_dir + '/' + wv_file

    # output files
    helper.ensure_dir(vocab_dir)
    vocab_file = vocab_dir + '/vocab.pkl'
    emb_file = vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file, spacy_model)
    dev_tokens = load_tokens(dev_file, spacy_model)
    test_tokens = load_tokens(test_file, spacy_model)
    if lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
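# -----------------------------------------------------------------------------
# NOTE: A hedged usage sketch for prepare_vocab(). The spaCy model name and the
# directory layout are placeholders; adjust them to the actual dataset paths.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    import spacy

    nlp = spacy.load('en_core_web_sm')  # any tokenizer-bearing spaCy pipeline works
    prepare_vocab(data_dir='dataset/mydata', vocab_dir='dataset/vocab',
                  spacy_model=nlp, wv_dim=300, min_freq=0, lower=True)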
def prepare_vocabulary(vocab_params):
    # input files
    train_file = vocab_params.data_dir + '/train.json'
    dev_file = vocab_params.data_dir + '/dev.json'
    test_file = vocab_params.data_dir + '/test.json'
    wv_file = vocab_params.glove_dir + '/' + vocab_params.glove_text_file
    wv_dim = vocab_params.emb_dim

    # output files
    helper.ensure_dir(vocab_params.vocab_dir)
    vocab_file = vocab_params.vocab_dir + vocab_params.vocab_file
    emb_file = vocab_params.vocab_dir + vocab_params.embed_file

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if vocab_params.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, vocab_params.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
    return v
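# -----------------------------------------------------------------------------
# NOTE: prepare_vocabulary() expects a vocab_params object exposing the attributes
# referenced above. The container below is a hedged sketch (SimpleNamespace is just
# one convenient choice); all paths and file names are placeholders. Because
# vocab_file/embed_file are appended to vocab_dir without a separator, vocab_dir
# ends with '/' in this example.
# -----------------------------------------------------------------------------
from types import SimpleNamespace

example_vocab_params = SimpleNamespace(
    data_dir='dataset/mydata',             # holds train.json / dev.json / test.json
    glove_dir='dataset/glove',             # holds the GloVe text file
    glove_text_file='glove.840B.300d.txt',
    emb_dim=300,                           # must match the GloVe dimensionality
    vocab_dir='dataset/vocab/',            # output directory (note trailing slash)
    vocab_file='vocab.pkl',
    embed_file='embedding.npy',
    lower=True,                            # lowercase all tokens before counting
    min_freq=0,                            # 0 -> keep tokens covered by GloVe
)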
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    schema_file = args.data_dir + '/schemas.json'
    wv_file = args.emb_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_file = args.vocab_dir + '/chars.json'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping embeddings to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    # print("all done.")

    print("building schemas...")
    all_schemas = set()
    subj_type = set()
    obj_type = set()
    min_count = 2
    pos_tags = set()
    chars = defaultdict(int)
    with open(train_file) as f:
        a = json.load(f)
    for ins in a:
        for spo in ins['spo_details']:
            all_schemas.add(spo[3])
            subj_type.add(spo[2])
            obj_type.add(spo[6])
        for pos in ins['pos_tags']:
            pos_tags.add(pos)
        for token in ins['tokens']:
            for char in token:
                chars[char] += 1

    id2predicate = {i + 1: j for i, j in enumerate(all_schemas)}  # 0 is reserved for the termination class
    predicate2id = {j: i for i, j in id2predicate.items()}
    id2subj_type = {i + 1: j for i, j in enumerate(subj_type)}  # 0 is reserved for the termination class
    subj_type2id = {j: i for i, j in id2subj_type.items()}
    id2obj_type = {i + 1: j for i, j in enumerate(obj_type)}  # 0 is reserved for the termination class
    obj_type2id = {j: i for i, j in id2obj_type.items()}

    with codecs.open(schema_file, 'w', encoding='utf-8') as f:
        json.dump([
            id2predicate, predicate2id,
            id2subj_type, subj_type2id,
            id2obj_type, obj_type2id
        ], f, indent=4, ensure_ascii=False)

    print("dumping chars to files...")
    with codecs.open(char_file, 'w', encoding='utf-8') as f:
        chars = {i: j for i, j in chars.items() if j >= min_count}
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        id2pos = {i + 2: j for i, j in enumerate(pos_tags)}  # padding: 0, unk: 1
        pos2id = {j: i for i, j in id2pos.items()}
        json.dump([id2char, char2id, id2pos, pos2id], f, indent=4, ensure_ascii=False)
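# -----------------------------------------------------------------------------
# NOTE: Hedged sketch of how the schemas.json / chars.json files written above can
# be read back. JSON stores dict keys as strings, so the id->label maps need their
# keys cast back to int on load; the function name and file paths are placeholders.
# -----------------------------------------------------------------------------
import json


def load_schemas(schema_file, char_file):
    with open(schema_file, encoding='utf-8') as f:
        (id2predicate, predicate2id,
         id2subj_type, subj_type2id,
         id2obj_type, obj_type2id) = json.load(f)
    with open(char_file, encoding='utf-8') as f:
        id2char, char2id, id2pos, pos2id = json.load(f)
    # Restore integer keys for the id->label directions.
    id2predicate = {int(i): j for i, j in id2predicate.items()}
    id2subj_type = {int(i): j for i, j in id2subj_type.items()}
    id2obj_type = {int(i): j for i, j in id2obj_type.items()}
    id2char = {int(i): j for i, j in id2char.items()}
    id2pos = {int(i): j for i, j in id2pos.items()}
    return (id2predicate, predicate2id, id2subj_type, subj_type2id,
            id2obj_type, obj_type2id, id2char, char2id, id2pos, pos2id)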
def main(): args = parse_args() # input files train_file = args.data_dir + "/rationale_train.json" dev_file = args.data_dir + "/rationale_dev.json" un_file = args.data_dir + "/rationale_un.json" wl_file = args.data_dir + "/rationale_wl.json" cts_file = args.data_dir + "/rationale_cts.json" bc_file = args.data_dir + "/rationale_bc.json" wv_file = args.glove_dir + "/" + args.wv_file wv_dim = args.wv_dim # output files helper.ensure_dir(args.vocab_dir) vocab_file = args.vocab_dir + "/vocab.pkl" emb_file = args.vocab_dir + "/embedding.npy" # load files print("loading files...") train_tokens = load_tokens(train_file) dev_tokens = load_tokens(dev_file) un_tokens = load_tokens(un_file) wl_tokens = load_tokens(wl_file) cts_tokens = load_tokens(cts_file) bc_tokens = load_tokens(bc_file) if args.lower: train_tokens, dev_tokens, un_tokens, wl_tokens, cts_tokens, bc_tokens = [ [t.lower() for t in tokens] for tokens in ( train_tokens, dev_tokens, un_tokens, wl_tokens, cts_tokens, bc_tokens, ) ] # load glove print("loading glove...") glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim) print("{} words loaded from glove.".format(len(glove_vocab))) print("building vocab...") v = build_vocab(train_tokens, glove_vocab, args.min_freq) print("calculating oov...") datasets = { "train": train_tokens, "dev": dev_tokens, "un": un_tokens, "wl": wl_tokens, "cts": cts_tokens, "bc": bc_tokens, } for dname, d in datasets.items(): total, oov = count_oov(d, v) print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total)) print("building embeddings...") embedding = vocab.build_embedding(wv_file, v, wv_dim) print("embedding size: {} x {}".format(*embedding.shape)) print("dumping to files...") with open(vocab_file, "wb") as outfile: pickle.dump(v, outfile) np.save(emb_file, embedding) print("all done.")
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/testa.jsonl'
    test_file = args.data_dir + '/testb.jsonl'
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_vocab_file = args.vocab_dir + '/vocab_char.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens, train_chars = load_tokens(train_file)
    dev_tokens, dev_chars = load_tokens(dev_file)
    test_tokens, test_chars = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]
    if args.char_lower and train_chars:
        train_chars, dev_chars, test_chars = [[c.lower() for c in chars] for chars in
                                              (train_chars, dev_chars, test_chars)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    if args.all:
        all_tokens = train_tokens + dev_tokens + test_tokens
    else:
        all_tokens = train_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    if train_chars:
        print("building vocab for chars...")
        all_chars = train_chars + dev_chars + test_chars
        char_counter = Counter(all_chars)
        # char_vocab = constant.VOCAB_PREFIX + sorted(char_counter.keys(), key=char_counter.get, reverse=True)
        char_vocab = constant.VOCAB_PREFIX + sorted(list(char_counter.keys()))
        print("vocab built with {} chars.".format(len(char_vocab)))
    else:
        char_vocab = None

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    if char_vocab:
        with open(char_vocab_file, 'wb') as outfile:
            pickle.dump(char_vocab, outfile)
    np.save(emb_file, embedding)
    print("all done.")
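# -----------------------------------------------------------------------------
# NOTE: Hedged sketch of how the vocab.pkl / embedding.npy pair produced by these
# scripts is typically consumed at training time. The PyTorch usage and the paths
# below are illustrative assumptions, not necessarily how this repo wires up its
# model; it also assumes the pickled vocab is an ordered list with PAD at index 0.
# -----------------------------------------------------------------------------
import pickle

import numpy as np
import torch
import torch.nn as nn

with open('dataset/vocab/vocab.pkl', 'rb') as f:      # placeholder path
    word_vocab = pickle.load(f)
word2id = {w: i for i, w in enumerate(word_vocab)}

emb_matrix = np.load('dataset/vocab/embedding.npy')   # shape: |V| x wv_dim
emb_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(emb_matrix).float(),
    freeze=False,       # allow fine-tuning of the word vectors
    padding_idx=0,      # assumes index 0 is the PAD token
)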