def main():
    """Build the UCCA embedding resources and return the loaded embedding.

    Derives the embedding from the train/dev/test corpora under
    ``args.data_dir``, writes the index and embedding files under
    ``args.ucca_embedding_dir``, then loads and returns the result.
    """
    args = parse_args()

    # Where the embedding artefacts live.
    ucca_dir = args.ucca_embedding_dir
    embedding_file = ucca_dir + '/' + args.ucca_embedding_file
    index_file = ucca_dir + '/' + args.ucca_embedding_index_file

    # input files
    corpora = [
        args.data_dir + '/train.json',
        args.data_dir + '/dev.json',
        args.data_dir + '/test.json',
    ]

    helper.ensure_dir(ucca_dir)
    UccaEmbedding.prepare(args.ucca_embedding_dim, corpora, index_file,
                          embedding_file, args.ucca_embedding_source)
    return UccaEmbedding(args.ucca_embedding_dim, index_file, embedding_file)
exit(1) # Vocab vocab_file = model_dir + '/vocab.pkl' vocab = Vocab(vocab_file, load=True) assert opt[ 'vocab_size'] == vocab.size, "Vocab size must match that in the saved model." # UCCA Embedding ucca_embedding = None if opt['ucca_embedding_dim'] > 0: embedding_file = opt['ucca_embedding_dir'] + '/' + opt[ 'ucca_embedding_file'] index_file = opt['ucca_embedding_dir'] + '/' + opt[ 'ucca_embedding_index_file'] ucca_embedding = UccaEmbedding(opt['ucca_embedding_dim'], index_file, embedding_file) data_file = opt['data_dir'] + '/{}.json'.format(args.dataset) with open(data_file) as infile: data_input = json.load(infile) data = DataLoader(data_input, opt['batch_size'], opt, vocab, evaluation=True, ucca_embedding=ucca_embedding) print("{} batches created for test".format(len(data.data))) model_data.append(data) evaluator = GCNEnsembleEvaluator(model_files)
# load vocab vocab_file = opt['vocab_dir'] + '/vocab.pkl' vocab = Vocab(vocab_file, load=True) opt['vocab_size'] = vocab.size emb_file = opt['vocab_dir'] + '/embedding.npy' emb_matrix = np.load(emb_file) assert emb_matrix.shape[0] == vocab.size assert emb_matrix.shape[1] == opt['emb_dim'] # UCCA Embedding? ucca_embedding = None if args.ucca_embedding_dim > 0: embedding_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_file index_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_index_file ucca_embedding = UccaEmbedding(args.ucca_embedding_dim, index_file, embedding_file) opt['ucca_embedding_vocab_size'] = ucca_embedding.embedding_matrix.shape[0] assert ucca_embedding.embedding_matrix.shape[1] == args.ucca_embedding_dim # load data print("Loading data from {} with batch size {}...".format( opt['data_dir'], opt['batch_size'])) with open(opt['data_dir'] + '/train.json') as infile: train_input = json.load(infile) train_batch = DataLoader(train_input, opt['batch_size'], opt, vocab, evaluation=False, apply_filters=True, ucca_embedding=ucca_embedding)