def test_process(opt):
    """Load the OpenSubtitles dataloader, build the transformer model,
    optionally restore a checkpoint, and run evaluation via ``test``.

    Args:
        opt: argparse-style namespace; mutated in place with derived
            hyperparameters (d_word_vec, vocab sizes, pad indices, ...).
    """
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b
    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    def load_dataset(data_arg, wvpath, embedding_size):
        # wvpath/embedding_size are ignored here: only the dataloader is
        # needed for evaluation, no pretrained word vectors.
        dm = data_class(**data_arg)
        return dm

    opt.n_position = 100
    dm = load_dataset(data_arg, None, opt.n_position)
    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)
    if opt.restore is not None:  # fixed: identity comparison with None, not `!= None`
        checkpoint = torch.load(opt.restore)
        model.load_state_dict(checkpoint['net'])
    dl = cotk.dataloader.OpenSubtitles(
        opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    test(model, dm, device, opt, dl)
def main(args):
    """Configure a TF session, load data and word vectors (with optional
    on-disk caching), then dispatch to train or test.

    Args:
        args: argparse-style namespace (cuda, dataset, wvclass, cache, ...).
    """
    if args.debug:
        debug()
    if args.cuda:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        # Hide all GPUs from TF when running on CPU.
        config = tf.ConfigProto(device_count={'GPU': 0})
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:  # fixed: `is None` instead of `== None`
        wordvec_class = Glove

    if args.cache:
        data = try_cache(data_class, (args.datapath,), args.cache_dir)
        vocab = data.vocab_list
        embed = try_cache(
            lambda wv, ez, vl: wordvec_class(wv).load(ez, vl),
            (args.wvpath, args.embedding_size, vocab),
            args.cache_dir, wordvec_class.__name__)
    else:
        data = data_class(args.datapath)
        wv = wordvec_class(args.wvpath)
        vocab = data.vocab_list
        embed = wv.load(args.embedding_size, vocab)

    embed = np.array(embed, dtype=np.float32)

    with tf.Session(config=config) as sess:
        model = create_model(sess, data, args, embed)
        if args.mode == "train":
            model.train_process(sess, data, args)
        else:
            model.test_process(sess, data, args)
def main(args, load_exclude_set, restoreCallback):
    """Set up logging/CUDA, load dataset + word vectors (optionally cached),
    and train or test a Seq2seq model, dumping test metrics to result.json.

    Args:
        args: argparse-style namespace.
        load_exclude_set: parameters to skip when restoring a checkpoint.
        restoreCallback: hook invoked after checkpoint restore.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(data_arg, wvpath, embedding_size):
        # Build the dataloader and its word vectors together so a single
        # try_cache entry covers both.
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load(embedding_size, dm.vocab_list)

    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir,
            data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath,
                                            args.embedding_size)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()
        for key, val in test_res.items():
            if isinstance(val, bytes):
                # NOTE(review): str(bytes) yields "b'...'"; .decode() may be
                # the real intent — kept as-is, confirm against consumers.
                test_res[key] = str(val)
        # fixed: use a context manager instead of leaking the file handle
        with open("./result.json", "w") as res_file:
            json.dump(test_res, res_file)
    else:
        raise ValueError("Unknown mode")
def main(args, load_exclude_set, restoreCallback):
    """Initialize logging/CUDA, build a BERT-tokenized OpenSubtitles
    dataloader plus its embedding matrix (optionally cached), then train
    or test a Seq2seq model.

    Args:
        args: argparse-style namespace.
        load_exclude_set: parameters to skip when restoring a checkpoint.
        restoreCallback: hook invoked after checkpoint restore.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)

    data_arg = Storage()
    data_arg.file_id = args.datapath + "#OpenSubtitles"
    data_arg.tokenizer = PretrainedTokenizer(
        BertTokenizer.from_pretrained(args.bert_vocab))
    data_arg.pretrained = "bert"

    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(arg, wvpath, embedding_size):
        # One step builds both the dataloader and its embedding matrix so
        # they share a single cache entry.
        vectors = wordvec_class(wvpath)
        loader = data_class(**arg)
        matrix = vectors.load_matrix(embedding_size,
                                     loader.frequent_vocab_list)
        return loader, matrix

    if args.cache:
        cache_key = data_class.__name__ + "_" + wordvec_class.__name__
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir, cache_key)
    else:
        dm, volatile.wordvec = load_dataset(
            data_arg, args.wvpath, args.embedding_size)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        model.test_process()
    else:
        raise ValueError("Unknown mode")
def train_process(opt):
    """Load OpenSubtitles, build the transformer and scheduled optimizer,
    optionally resume from a checkpoint, and start training via ``train``.

    Args:
        opt: argparse-style namespace; mutated in place with derived
            hyperparameters (d_word_vec, vocab sizes, pad indices, ...).
    """
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b
    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    def load_dataset(data_arg, wvpath, embedding_size):
        # wvpath/embedding_size are ignored: no pretrained word vectors,
        # only the dataloader is constructed.
        dm = data_class(**data_arg)
        return dm

    opt.n_position = 100
    dm = load_dataset(data_arg, None, opt.n_position)
    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)
    n_steps = 0
    optimizer_ = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    if opt.restore is not None:  # fixed: identity comparison with None, not `!= None`
        checkpoint = torch.load(opt.restore)
        model.load_state_dict(checkpoint['net'])
        n_steps = checkpoint['n_steps']
        optimizer_.load_state_dict(checkpoint['opt'])
    # Wrap the optimizer in the warmup/decay schedule, resuming at n_steps.
    optimizer = ScheduledOptim(optimizer_, opt.lr, opt.d_model,
                               opt.n_warmup_steps, n_steps)
    dl = cotk.dataloader.OpenSubtitles(
        opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    train(model, dm, optimizer, device, opt, dl)
def main(args):
    """Initialize logging/CUDA, load the dataloader and word vectors
    (optionally via the on-disk cache), then run train or test on Seq2seq.

    Args:
        args: argparse-style namespace (dataset, wvclass, cache, mode, ...).
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(0, args.cuda)

    volatile = Storage()

    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    if args.cache:
        # Dataloader and embeddings are cached under separate keys.
        dm = try_cache(data_class, (args.datapath,), args.cache_dir)
        volatile.wordvec = try_cache(
            lambda wv, ez, vl: wordvec_class(wv).load(ez, vl),
            (args.wvpath, args.embedding_size, dm.vocab_list),
            args.cache_dir, wordvec_class.__name__)
    else:
        dm = data_class(args.datapath)
        vectors = wordvec_class(args.wvpath)
        volatile.wordvec = vectors.load(args.embedding_size, dm.vocab_list)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        model.test_process()
    else:
        raise ValueError("Unknown mode")
def main(args):
    """Configure a TF session, load data and an embedding matrix (with
    optional caching), then train, or test and dump metrics to result.json.

    Args:
        args: argparse-style namespace (cuda, dataset, wvclass, mode, ...).
    """
    if args.debug:
        debug()
    if args.cuda:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        # Hide all GPUs from TF when running on CPU.
        config = tf.ConfigProto(device_count={'GPU': 0})
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:  # fixed: `is None` instead of `== None`
        wordvec_class = Glove

    if args.cache:
        data = try_cache(data_class, (args.datapath,), args.cache_dir)
        vocab = data.frequent_vocab_list
        embed = try_cache(
            lambda wv, ez, vl: wordvec_class(wv).load_matrix(ez, vl),
            (args.wvpath, args.embedding_size, vocab),
            args.cache_dir, wordvec_class.__name__)
    else:
        data = data_class(args.datapath)
        wv = wordvec_class(args.wvpath)
        vocab = data.frequent_vocab_list
        embed = wv.load_matrix(args.embedding_size, vocab)

    embed = np.array(embed, dtype=np.float32)

    with tf.Session(config=config) as sess:
        model = create_model(sess, data, args, embed)
        if args.mode == "train":
            model.train_process(sess, data, args)
        else:
            test_res = model.test_process(sess, data, args)
            for key, val in test_res.items():
                if isinstance(val, bytes):
                    # NOTE(review): str(bytes) yields "b'...'"; .decode()
                    # may be the real intent — kept as-is, confirm.
                    test_res[key] = str(val)
            # fixed: use a context manager instead of leaking the file handle
            with open("./result.json", "w") as res_file:
                json.dump(test_res, res_file)
def main(args, load_exclude_set, restoreCallback):
    """Set up logging/CUDA, load dataset + word vectors (optionally cached),
    pick the requested Seq2seq variant, and train or test it.

    Args:
        args: argparse-style namespace (model, mode, dataset, wvclass, ...).
        load_exclude_set: parameters to skip when restoring a checkpoint.
        restoreCallback: hook invoked after checkpoint restore.

    Raises:
        ValueError: if ``args.model`` or ``args.mode`` is unknown.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(args.device, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath

    # RAML parameters
    if args.model == "raml":
        data_arg.raml_file = "samples_iwslt14.txt"
        # fixed: was `10 or args.n_samples`, which always evaluated to 10
        # and silently ignored the CLI argument.
        data_arg.num_samples = args.n_samples or 10
        data_arg.tau = 0.4

    # NOTE(review): unlike the sibling entry points, there is no
    # `if wordvec_class is None: wordvec_class = Glove` fallback here;
    # a missing wvclass would crash below — confirm args.wvclass is
    # always provided.
    wordvec_class = WordVector.load_class(args.wvclass)

    def load_dataset(data_arg, wvpath, embedding_size):
        # Build the dataloader and its embedding matrix together so a
        # single try_cache entry covers both.
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load_matrix(embedding_size, dm.vocab_list)

    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir,
            data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath,
                                            args.embedding_size)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    if args.model == "basic":
        model = Seq2seq(param)
    elif args.model == "raml":
        model = Seq2seqRAML(param)
    elif args.model == "scheduled-sampling":
        model = Seq2seqSS(param)
    elif args.model == "policy-gradient":
        model = Seq2seqPG(param)
    else:
        # fixed: an unknown model previously fell through and raised a
        # confusing NameError on `model` below.
        raise ValueError("Unknown model")

    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()
        # fixed: use a context manager instead of leaking the file handle
        with open("./result.json", "w") as res_file:
            json.dump(test_res, res_file)
    else:
        raise ValueError("Unknown mode")
def base_test_init(self, dl):
    """Shared sanity checks for a freshly constructed SingleTurnDialog
    dataloader: constructor validation, field/vocab/tokenizer types,
    batching state, vocab sizes, and per-sentence boundary tokens.

    Args:
        dl: the dataloader instance under test.
    """
    # Invalid `pretrained` values must be rejected at construction time.
    with pytest.raises(ValueError):
        SingleTurnDialog(
            "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
            pretrained='none')
    with pytest.raises(ValueError):
        SingleTurnDialog(
            "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
            pretrained='gpt2')
    with pytest.raises(ValueError):
        SingleTurnDialog(
            "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
            pretrained='bert')

    assert isinstance(dl, SingleTurnDialog)
    assert isinstance(dl.file_id, str)
    assert isinstance(dl.file_path, str)
    for set_name, fields in dl.fields.items():
        assert isinstance(set_name, str)
        assert isinstance(fields, dict)
        for field_name, field in fields.items():
            assert isinstance(field_name, str)
            assert isinstance(field, Field)
    assert isinstance(dl.vocabs, list)
    for vocab in dl.vocabs:
        assert isinstance(vocab, Vocab)
    assert isinstance(dl.tokenizers, list)
    for toker in dl.tokenizers:
        assert isinstance(toker, Tokenizer)

    # Every field's content lists must line up with the set's index.
    for (_, data), (_, index) in zip(dl.data.items(), dl.index.items()):
        assert isinstance(data, dict)
        assert isinstance(index, list)
        for field_name, content in data.items():
            assert isinstance(content, dict)
            for _, each_content in content.items():
                assert isinstance(each_content, list)
                assert len(index) == len(each_content)
    for _, batch_id in dl.batch_id.items():
        assert batch_id == 0
    for _, batch_size in dl.batch_size.items():
        assert batch_size is None

    assert isinstance(dl.frequent_vocab_list, list)
    assert dl.frequent_vocab_size == len(dl.frequent_vocab_list)
    assert isinstance(dl.all_vocab_list, list)
    assert dl.all_vocab_size == len(dl.all_vocab_list)
    assert dl.all_vocab_size >= dl.frequent_vocab_size

    for _, data in dl.data.items():
        post = data['post']
        post_ids = post['id']
        assert isinstance(post_ids, list)
        assert isinstance(post_ids[0], list)
        if dl._pretrained is None or dl._pretrained == "gpt2":
            assert post_ids[0][0] == dl.go_id
            assert post_ids[0][-1] == dl.eos_id
        else:  # dl._pretrained == "bert"
            assert post_ids[0][0] == dl.get_special_tokens_id("cls")
            assert post_ids[0][-1] == dl.get_special_tokens_id("sep")
        post_strs = post['str']
        assert isinstance(post_strs, list)
        assert isinstance(post_strs[0], str)

        resp = data['resp']
        resp_ids = resp['id']
        assert isinstance(resp_ids, list)
        assert isinstance(resp_ids[0], list)
        # fixed: these boundary-token checks previously re-tested post_ids
        # (copy-paste bug), so resp sentences were never verified.
        if dl._pretrained is None or dl._pretrained == "gpt2":
            assert resp_ids[0][0] == dl.go_id
            assert resp_ids[0][-1] == dl.eos_id
        else:  # dl._pretrained == "bert"
            assert resp_ids[0][0] == dl.get_special_tokens_id("cls")
            assert resp_ids[0][-1] == dl.get_special_tokens_id("sep")
        resp_strs = resp['str']
        assert isinstance(resp_strs, list)
        assert isinstance(resp_strs[0], str)
        assert len(post) == len(resp)

    # Constructing with no arguments at all must fail.
    with pytest.raises(TypeError):
        SingleTurnDialog()