def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    shell = utils.ShellUtils()
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("evaluate")
    utils.seed(args.seed)
    logger.info("loading data...")
    data_dir = pathlib.Path(args.data_dir)
    data = {
        split: list(map(
            Dialog.from_json,
            utils.load_json(data_dir.joinpath(f"{split}.json"))
        ))
        for split in (set(args.eval_splits) | {"train"})
    }
    processor: DialogProcessor = utils.load_pickle(args.processor_path)
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    model.load_state_dict(torch.load(args.ckpt_path))
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    for split in args.eval_splits:
        dialogs = data[split]
        logger.info(f"running evaluation on '{split}' split...")
        eval_args = EvaluateArugments(
            model=model,
            train_data=tuple(data["train"]),
            test_data=tuple(dialogs),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len
        )
        utils.save_json(eval_args.to_json(),
                        save_dir.joinpath(f"eval-{split}-args.json"))
        with torch.no_grad():
            results = evaluate(eval_args)
        save_path = save_dir.joinpath(f"eval-{split}.json")
        logger.info(f"'{split}' results saved to {save_path}")
        utils.save_json(results, save_path)
    logger.info("done!")
def save(self, data: Mapping[str, Sequence[Dialog]], path: pathlib.Path,
         overwrite: bool = False):
    if ((path.is_file() and path.exists() or
         path.is_dir() and utils.has_element(path.glob("*"))) and
            not overwrite):
        raise FileExistsError(f"file exists or directory is "
                              f"not empty: {path}")
    shell = utils.ShellUtils()
    shell.remove(path, recursive=True, silent=True)
    return self.save_imp(data, path)
def main(): args = utils.parse_args(create_parser()) if args.logging_config is not None: logging.config.dictConfig(utils.load_yaml(args.logging_config)) save_dir = pathlib.Path(args.save_dir) if (not args.overwrite and save_dir.exists() and utils.has_element(save_dir.glob("*.json"))): raise FileExistsError(f"save directory ({save_dir}) is not empty") shell = utils.ShellUtils() shell.mkdir(save_dir, silent=True) logger = logging.getLogger("generate") utils.seed(args.seed) logger.info("loading data...") processor = utils.load_pickle(args.processor_path) data = None if args.data_path is not None: data = list(map(Dialog.from_json, utils.load_json(args.data_path))) logger.info("preparing model...") torchmodels.register_packages(models) model_cls = torchmodels.create_model_cls(models, args.model_path) model: models.AbstractTDA = model_cls(processor.vocabs) model.reset_parameters() ckpt = torch.load(args.ckpt_path) model.load_state_dict(ckpt) device = torch.device("cpu") if args.gpu is not None: device = torch.device(f"cuda:{args.gpu}") model = model.to(device) gen_args = GenerateArguments( model=model, processor=processor, data=data, instances=args.instances, batch_size=args.batch_size, conv_scale=args.conv_scale, spkr_scale=args.spkr_scale, goal_scale=args.goal_scale, state_scale=args.state_scale, sent_scale=args.sent_scale, validate_dst=args.validate_dst, validate_unique=args.validate_unique, device=device ) utils.save_json(gen_args.to_json(), save_dir.joinpath("args.json")) with torch.no_grad(): samples = generate(gen_args) utils.save_json([sample.output.to_json() for sample in samples], save_dir.joinpath("gen-out.json")) utils.save_json([sample.input.to_json() for sample in samples], save_dir.joinpath("gen-in.json")) utils.save_lines([str(sample.log_prob) for sample in samples], save_dir.joinpath("logprob.txt")) logger.info("done!")
def __post_init__(self):
    self._logger = logging.getLogger(self.__class__.__name__)
    self._user_tensor = self.processor.tensorize_state_vocab(
        speaker="user",
        # tensorizer=self.processor.tensorize_turn_label_asv
    )
    self._user_tensor = self._user_tensor.to(self.device)
    self._wizard_tensor = self.processor.tensorize_state_vocab(
        speaker="wizard"
    )
    self._wizard_tensor = self._wizard_tensor.to(self.device)
    self._bce = nn.BCEWithLogitsLoss(reduction="none")
    utils.ShellUtils().mkdir(self.save_dir, True)
    assert self.asr_sigmoid_sum_order in {"sigmoid-sum", "sum-sigmoid"}
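# --- Illustrative sketch, not part of the original runner ---
# One plausible reading of the two orderings accepted by the assertion above,
# assuming per-slot logits and that `torch` is imported in this module:
# "sigmoid-sum" squashes each logit and then sums the probabilities, while
# "sum-sigmoid" sums the raw logits first and squashes the total. The actual
# ASR scoring implemented elsewhere in the runner may differ; this only
# clarifies what the two option names suggest.
def _asr_score_sketch(logits: "torch.Tensor", order: str) -> "torch.Tensor":
    if order == "sigmoid-sum":
        return torch.sigmoid(logits).sum(dim=-1)
    if order == "sum-sigmoid":
        return torch.sigmoid(logits.sum(dim=-1))
    raise ValueError(f"unknown asr_sigmoid_sum_order: {order}")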
def main(args):
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    logger = logging.getLogger("GladRunner")
    save_dir = pathlib.Path(args.save_dir)
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger.info("running glad...")
    if args.data_format == "woz":
        load_fn = load_woz
    elif args.data_format == "json":
        load_fn = utils.chain_func(
            lambda x: list(map(Dialog.from_json, x)),
            utils.load_json
        )
    elif args.data_format == "dstc":
        load_fn = load_dstc2
    else:
        raise ValueError(f"unsupported data format: {args.data_format}")
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(try_path(data_dir.joinpath("valid.json")) or
                             data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    glad = Glad(
        train_data=train_data,
        valid_data=valid_data,
        test_data=test_data,
        glad_dir=args.glad_dir,
        epochs=args.max_epochs,
        batch_size=args.batch_size,
        emb_dropout=args.emb_dropout,
        local_dropout=args.local_dropout,
        global_dropout=args.global_dropout,
        save_dir=save_dir.joinpath("exp"),
        seed=args.seed,
        early_stop_criterion=args.early_stop_criterion,
        gpu=args.gpu
    )
    pred, res = glad.run_all()
    logger.info("saving results...")
    utils.save_json(pred, str(save_dir.joinpath("pred.json")))
    utils.save_json(res, str(save_dir.joinpath("eval.json")))
    logger.info("done!")
def multi_glad(args: MultiGladArguments) -> dict:
    save_dir = args.save_dir
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("MultiGlad")
    engine = inflect.engine()
    for i in range(1, args.runs + 1):
        logger.info(f"[{i:02d}] running glad {engine.ordinal(i)} time...")
        trial_dir = save_dir.joinpath(f"trial-{i:02d}")
        shell.mkdir(trial_dir, silent=True)
        glad = Glad(
            train_data=args.train_data,
            valid_data=args.valid_data,
            test_data=args.test_data,
            glad_dir=args.glad_dir,
            save_dir=trial_dir.joinpath("exp"),
            epochs=args.max_epochs,
            batch_size=args.batch_size,
            emb_dropout=args.emb_dropout,
            local_dropout=args.local_dropout,
            global_dropout=args.global_dropout,
            early_stop_criterion=args.early_stop_criterion,
            gpu=args.gpu
        )
        pred, res = glad.run_all()
        logger.info(f"[{i:02d}] saving results...")
        utils.save_json(pred, str(trial_dir.joinpath("pred.json")))
        utils.save_json(res, str(trial_dir.joinpath("eval.json")))
        logger.info(f"[{i:02d}] {engine.ordinal(i)} glad run finished.")
    logger.info("aggregating results...")
    result_paths = (str(save_dir.joinpath(f"trial-{i:02d}/eval.json"))
                    for i in range(1, args.runs + 1))
    results = list(map(utils.load_yaml, result_paths))
    summary = reduce_json(results)
    utils.save_json(summary, str(save_dir.joinpath("eval-summary.json")))
    utils.save_json(args.to_json(), save_dir.joinpath("args.json"))
    logger.info("done!")
    return summary
def optimize(trial: optuna.Trial, model_path, config_path):
    optimizer = Optimizer(trial)
    run_config = utils.load_yaml(config_path)
    mdl_config = utils.load_yaml(model_path)
    run_config = optimizer.optimize_config(run_config)
    mdl_config = optimizer.optimize_model(mdl_config)
    shell = utils.ShellUtils()
    shell.mkdir("optimize-debug", silent=True)
    utils.save_yaml(mdl_config, "optimize-debug/model.yml")
    utils.save_json(run_config, "optimize-debug/run.json")
    run_path, mdl_path = tempfile.mktemp(), tempfile.mktemp()
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_dir = (pathlib.Path(__file__).absolute().parent
                .joinpath(f"out/woz/{timestamp}"))
    run_config["save-dir"] = str(save_dir)
    run_config["model-path"] = mdl_path
    utils.save_json(run_config, run_path)
    utils.save_json(mdl_config, mdl_path)
    retcode, stdout, stderr = utils.Process(
        args=f"python run.py @load {run_path}".split(),
        cwd=pathlib.Path(__file__).absolute().parent,
        print_stdout=True,
        print_stderr=True
    ).run()
    if retcode:
        raise RuntimeError(f"process 'run.py' failed; "
                           f"return code: {retcode}; stderr: {stderr}")
    shell.remove(run_path, silent=True)
    shell.remove(mdl_path, silent=True)
    gen_dirs = list(save_dir.glob("gen-*"))
    if not gen_dirs:
        raise RuntimeError("no generation directory detected")
    if len(gen_dirs) > 1:
        warnings.warn(f"more than one generation "
                      f"directory detected: {gen_dirs}")
    gen_dir = gen_dirs[-1]
    ttest_results = utils.load_json(gen_dir.joinpath("ttest-results.json"))
    return -ttest_results["hmean"]["t"]
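# --- Usage sketch (an assumption, not part of the original source) ---
# `optimize` above has the shape of an Optuna objective: it receives a trial,
# launches run.py with a sampled configuration, and returns the negated
# t-statistic. One way to wire it into a study is sketched below; the config
# paths and trial count are placeholders, and the helper name is hypothetical.
def run_study_sketch(model_path="model.yml", config_path="run.yml",
                     n_trials=20):
    # Optuna minimizes by default, which maximizes the underlying t-statistic
    # since `optimize` returns its negation.
    study = optuna.create_study()
    study.optimize(
        lambda trial: optimize(trial, model_path, config_path),
        n_trials=n_trials,
    )
    return study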
def main():
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("train")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda x: list(map(Dialog.from_json, x)),
                               utils.load_json)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    processor = datasets.DialogProcessor(
        sent_processor=datasets.SentProcessor(
            bos=True,
            eos=True,
            lowercase=True,
            tokenizer="space",
            max_len=30
        ),
        boc=True,
        eoc=True,
        state_order="randomized",
        max_len=30
    )
    processor.prepare_vocabs(
        list(itertools.chain(train_data, valid_data, test_data)))
    utils.save_pickle(processor, save_dir.joinpath("processor.pkl"))
    logger.info("preparing model...")
    utils.save_json(utils.load_yaml(args.gen_model_path),
                    save_dir.joinpath("model.json"))
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    utils.report_model(logger, model)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)

    def create_scheduler(s):
        return utils.PiecewiseScheduler(
            [utils.Coordinate(*t) for t in eval(s)])

    save_dir = pathlib.Path(args.save_dir)
    train_args = train.TrainArguments(
        model=model,
        train_data=tuple(train_data),
        valid_data=tuple(valid_data),
        processor=processor,
        device=device,
        save_dir=save_dir,
        report_every=args.report_every,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        optimizer=args.optimizer,
        gradient_clip=args.gradient_clip,
        l2norm_weight=args.l2norm_weight,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        kld_schedule=(utils.ConstantScheduler(1.0)
                      if args.kld_schedule is None
                      else create_scheduler(args.kld_schedule)),
        dropout_schedule=(utils.ConstantScheduler(1.0)
                          if args.dropout_schedule is None
                          else create_scheduler(args.dropout_schedule)),
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        disable_kl=args.disable_kl,
        kl_mode=args.kl_mode
    )
    utils.save_json(train_args.to_json(),
                    save_dir.joinpath("train-args.json"))
    record = train.train(train_args)
    utils.save_json(record.to_json(),
                    save_dir.joinpath("final-summary.json"))
    eval_dir = save_dir.joinpath("eval")
    shell.mkdir(eval_dir, silent=True)
    eval_data = dict(list(filter(None, [
        ("train", train_data) if "train" in args.eval_splits else None,
        ("dev", valid_data) if "dev" in args.eval_splits else None,
        ("test", test_data) if "test" in args.eval_splits else None
    ])))
    for split, data in eval_data.items():
        eval_args = evaluate.EvaluateArugments(
            model=model,
            train_data=tuple(train_data),
            test_data=tuple(data),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.valid_batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len
        )
        utils.save_json(eval_args.to_json(),
                        eval_dir.joinpath(f"eval-{split}-args.json"))
        eval_results = evaluate.evaluate(eval_args)
        save_path = eval_dir.joinpath(f"eval-{split}.json")
        utils.save_json(eval_results, save_path)
        logger.info(f"'{split}' results saved to {save_path}")
    logger.info(f"will run {args.gen_runs} generation trials...")
    gen_summary = []
    dst_summary = []
    for gen_idx in range(1, args.gen_runs + 1):
        logger.info(f"running {engine.ordinal(gen_idx)} generation trial...")
        gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}")
        shell.mkdir(gen_dir, silent=True)
        gen_args = generate.GenerateArguments(
            model=model,
            processor=processor,
            data=train_data,
            instances=int(round(len(train_data) * args.multiplier)),
            batch_size=args.valid_batch_size,
            conv_scale=args.conv_scale,
            spkr_scale=args.spkr_scale,
            goal_scale=args.goal_scale,
            state_scale=args.state_scale,
            sent_scale=args.sent_scale,
            validate_dst=True,
            validate_unique=args.validate_unique,
            device=device
        )
        utils.save_json(gen_args.to_json(), gen_dir.joinpath("gen-args.json"))
        with torch.no_grad():
            samples = generate.generate(gen_args)
        utils.save_json([sample.output.to_json() for sample in samples],
                        gen_dir.joinpath("gen-out.json"))
        utils.save_json([sample.input.to_json() for sample in samples],
                        gen_dir.joinpath("gen-in.json"))
        utils.save_lines([str(sample.log_prob) for sample in samples],
                         gen_dir.joinpath("logprob.txt"))
        da_data = [sample.output for sample in samples]
        # augment the training split with the generated dialogs; concatenate
        # instead of mutating train_data so it does not grow across trials
        data = {
            "train": train_data + da_data,
            "dev": valid_data,
            "test": test_data
        }
        # convert dialogs to dst dialogs
        data = {
            split: list(map(datasets.DSTDialog.from_dialog, dialogs))
            for split, dialogs in data.items()
        }
        for split, dialogs in data.items():
            logger.info(f"verifying '{split}' dataset...")
            for dialog in dialogs:
                dialog.compute_user_goals()
                dialog.validate()
        logger.info("preparing dst environment...")
        dst_processor = dst_datasets.DSTDialogProcessor(
            sent_processor=datasets.SentProcessor(
                bos=True,
                eos=True,
                lowercase=True,
                max_len=30
            ))
        dst_processor.prepare_vocabs(list(itertools.chain(*data.values())))
        train_dataset = dst_datasets.DSTDialogDataset(
            dialogs=data["train"],
            processor=dst_processor
        )
        train_dataloader = dst_datasets.create_dataloader(
            train_dataset,
            batch_size=args.dst_batch_size,
            shuffle=True,
            pin_memory=True
        )
        dev_dataloader = dst_run.TestDataloader(
            dialogs=data["dev"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size
        )
        test_dataloader = dst_run.TestDataloader(
            dialogs=data["test"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size
        )
        logger.info("saving dst processor object...")
        utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl"))
        torchmodels.register_packages(dst_models)
        dst_model_cls = torchmodels.create_model_cls(dst_pkg,
                                                     args.dst_model_path)
        dst_model = dst_model_cls(dst_processor.vocabs)
        dst_model = dst_model.to(device)
        logger.info(str(model))
        logger.info(f"number of parameters DST: "
                    f"{utils.count_parameters(dst_model):,d}")
        logger.info(f"running {args.dst_runs} trials...")
        all_results = []
        for idx in range(1, args.dst_runs + 1):
            logger.info(f"running {engine.ordinal(idx)} dst trial...")
            trial_dir = gen_dir.joinpath(f"dst-{idx:03d}")
            logger.info("resetting parameters...")
            dst_model.reset_parameters()
            logger.info("preparing trainer...")
            runner = dst_run.Runner(
                model=dst_model,
                processor=dst_processor,
                device=device,
                save_dir=trial_dir,
                epochs=int(round(args.dst_epochs / (1 + args.multiplier))),
                loss="sum",
                l2norm=args.dst_l2norm,
                gradient_clip=args.dst_gradient_clip,
                train_validate=False,
                early_stop=True,
                early_stop_criterion="joint-goal",
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5
            )
            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            if not args.dst_test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            eval_results["epoch"] = int(record.epoch)
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        # keep the aggregated means as a dict so the reduce_json call below
        # receives dicts rather than pre-formatted strings
        agg_results = {k: v["stats"]["mean"] for k, v in summary.items()}
        logger.info(pprint.pformat(agg_results))
        gen_summary.append(agg_results)
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(pprint.pformat({k: v["stats"]["mean"]
                                for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials "
                f"({args.gen_runs * args.dst_runs})...")
    logger.info(pprint.pformat({k: v["stats"]["mean"]
                                for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")
def main():
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("train")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda x: list(map(Dialog.from_json, x)),
                               utils.load_json)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    processor = datasets.DialogProcessor(
        sent_processor=datasets.SentProcessor(
            bos=True,
            eos=True,
            lowercase=True,
            tokenizer="space",
            max_len=30
        ),
        boc=True,
        eoc=True,
        state_order="randomized",
        max_len=30
    )
    processor.prepare_vocabs(
        list(itertools.chain(train_data, valid_data, test_data)))
    utils.save_pickle(processor, save_dir.joinpath("processor.pkl"))
    logger.info("preparing model...")
    utils.save_json(utils.load_yaml(args.model_path),
                    save_dir.joinpath("model.json"))
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    utils.report_model(logger, model)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)

    def create_scheduler(s):
        return utils.PiecewiseScheduler(
            [utils.Coordinate(*t) for t in eval(s)])

    save_dir = pathlib.Path(args.save_dir)
    train_args = train.TrainArguments(
        model=model,
        train_data=tuple(train_data),
        valid_data=tuple(valid_data),
        processor=processor,
        device=device,
        save_dir=save_dir,
        report_every=args.report_every,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        optimizer=args.optimizer,
        gradient_clip=args.gradient_clip,
        l2norm_weight=args.l2norm_weight,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        kld_schedule=(utils.ConstantScheduler(1.0)
                      if args.kld_schedule is None
                      else create_scheduler(args.kld_schedule)),
        dropout_schedule=(utils.ConstantScheduler(1.0)
                          if args.dropout_schedule is None
                          else create_scheduler(args.dropout_schedule)),
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        disable_kl=args.disable_kl,
        kl_mode=args.kl_mode
    )
    utils.save_json(train_args.to_json(),
                    save_dir.joinpath("train-args.json"))
    record = train.train(train_args)
    utils.save_json(record.to_json(),
                    save_dir.joinpath("final-summary.json"))
    eval_dir = save_dir.joinpath("eval")
    shell.mkdir(eval_dir, silent=True)
    eval_data = dict(list(filter(None, [
        ("train", train_data) if "train" in args.eval_splits else None,
        ("dev", valid_data) if "dev" in args.eval_splits else None,
        ("test", test_data) if "test" in args.eval_splits else None
    ])))
    for split, data in eval_data.items():
        eval_args = evaluate.EvaluateArugments(
            model=model,
            train_data=tuple(train_data),
            test_data=tuple(data),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.valid_batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len
        )
        utils.save_json(eval_args.to_json(),
                        eval_dir.joinpath(f"eval-{split}-args.json"))
        with torch.no_grad():
            eval_results = evaluate.evaluate(eval_args)
        save_path = eval_dir.joinpath(f"eval-{split}.json")
        utils.save_json(eval_results, save_path)
        logger.info(f"'{split}' results saved to {save_path}")
    logger.info("done!")
def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("gda")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(
        lambda data: list(map(Dialog.from_json, data)),
        utils.load_json
    )
    processor = utils.load_pickle(args.processor_path)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    data = {"train": train_data, "dev": valid_data, "test": test_data}
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    ckpt = torch.load(args.ckpt_path)
    model.load_state_dict(ckpt)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    logger.info(f"will run {args.gen_runs} generation trials...")
    gen_summary = []
    dst_summary = []
    for gen_idx in range(1, args.gen_runs + 1):
        logger.info(f"running {engine.ordinal(gen_idx)} generation trial...")
        gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}")
        shell.mkdir(gen_dir, silent=True)
        gen_args = generate.GenerateArguments(
            model=model,
            processor=processor,
            data=tuple(train_data),
            instances=int(round(len(train_data) * args.multiplier)),
            batch_size=args.gen_batch_size,
            conv_scale=args.conv_scale,
            spkr_scale=args.spkr_scale,
            goal_scale=args.goal_scale,
            state_scale=args.state_scale,
            sent_scale=args.sent_scale,
            validate_dst=True,
            validate_unique=args.validate_unique,
            device=device
        )
        utils.save_json(gen_args.to_json(), gen_dir.joinpath("args.json"))
        with torch.no_grad():
            samples = generate.generate(gen_args)
        utils.save_json([sample.output.to_json() for sample in samples],
                        gen_dir.joinpath("out.json"))
        utils.save_json([sample.input.to_json() for sample in samples],
                        gen_dir.joinpath("in.json"))
        utils.save_lines([str(sample.log_prob) for sample in samples],
                         gen_dir.joinpath("logprob.txt"))
        da_data = [sample.output for sample in samples]
        gen_data = {
            "train": data["train"] + da_data,
            "dev": data["dev"],
            "test": data["test"]
        }
        # convert dialogs to dst dialogs
        gen_data = {
            split: list(map(datasets.DSTDialog.from_dialog, dialogs))
            for split, dialogs in gen_data.items()
        }
        for split, dialogs in gen_data.items():
            logger.info(f"verifying '{split}' dataset...")
            for dialog in dialogs:
                dialog.compute_user_goals()
                dialog.validate()
        logger.info("preparing dst environment...")
        dst_processor = dst_datasets.DSTDialogProcessor(
            sent_processor=datasets.SentProcessor(
                bos=True,
                eos=True,
                lowercase=True,
                max_len=30
            ))
        dst_processor.prepare_vocabs(
            list(itertools.chain(*gen_data.values())))
        train_dataset = dst_datasets.DSTDialogDataset(
            dialogs=gen_data["train"],
            processor=dst_processor
        )
        train_dataloader = dst_datasets.create_dataloader(
            train_dataset,
            batch_size=args.dst_batch_size,
            shuffle=True,
            pin_memory=True
        )
        dev_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["dev"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size
        )
        test_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["test"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size
        )
        logger.info("saving dst processor object...")
        utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl"))
        torchmodels.register_packages(dst_models)
        dst_model_cls = torchmodels.create_model_cls(dst_pkg,
                                                     args.dst_model_path)
        dst_model = dst_model_cls(dst_processor.vocabs)
        dst_model = dst_model.to(device)
        logger.info(str(model))
        logger.info(f"number of parameters DST: "
                    f"{utils.count_parameters(dst_model):,d}")
        logger.info(f"will run {args.dst_runs} trials...")
        all_results = []
        for idx in range(1, args.dst_runs + 1):
            logger.info(f"running {engine.ordinal(idx)} dst trial...")
            trial_dir = gen_dir.joinpath(f"dst-{idx:03d}")
            logger.info("resetting parameters...")
            dst_model.reset_parameters()
            logger.info("preparing trainer...")
            runner = dst_run.Runner(
                model=dst_model,
                processor=dst_processor,
                device=device,
                save_dir=trial_dir,
                epochs=int(round(args.epochs / (1 + args.multiplier))),
                loss="sum",
                l2norm=args.l2norm,
                gradient_clip=args.gradient_clip,
                train_validate=False,
                early_stop=True,
                early_stop_criterion="joint-goal",
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5
            )
            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            if not args.test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            eval_results["epoch"] = int(record.epoch)
            eval_results["criterion"] = record.value
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        agg_results = {k: v["stats"]["mean"] for k, v in summary.items()}
        gen_summary.append(agg_results)
        logger.info(pprint.pformat(agg_results))
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(pprint.pformat({k: v["stats"]["mean"]
                                for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials "
                f"({args.gen_runs * args.dst_runs})...")
    logger.info(pprint.pformat({k: v["stats"]["mean"]
                                for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")
def save_imp(self, dat: Mapping[str, Sequence[Dialog]],
             path: pathlib.Path):
    shell = utils.ShellUtils()
    shell.mkdir(path, True)
    for split in ("train", "dev", "test"):
        self.save_json(dat[split], path.joinpath(f"{split}.json"))
def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("interpolate")
    data_dir = pathlib.Path(args.data_dir)
    data = {
        split: list(map(
            Dialog.from_json,
            utils.load_json(data_dir.joinpath(f"{split}.json"))
        ))
        for split in set(args.splits)
    }
    processor: DialogProcessor = utils.load_pickle(args.processor_path)
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    model.load_state_dict(torch.load(args.ckpt_path))
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    samples = (sample_data(data, args.anchor1),
               sample_data(data, args.anchor2))
    formatter = utils.DialogTableFormatter()
    logger.info(f"first sample: \n{formatter.format(samples[0])}")
    logger.info(f"second sample: \n{formatter.format(samples[1])}")
    logger.info("preparing environment...")
    dataloader = datasets.create_dataloader(
        dataset=datasets.DialogDataset(data=samples, processor=processor),
        batch_size=1,
        shuffle=False,
        pin_memory=False
    )
    inferencer = InterpolateInferencer(
        model=model,
        processor=processor,
        device=device
    )
    logger.info("interpolating...")
    with torch.no_grad():
        zconv_a, zconv_b = inferencer.encode(dataloader)
        zconv = torch.stack([
            zconv_a + (zconv_b - zconv_a) / args.steps * i
            for i in range(args.steps + 1)
        ])
        gen_samples = inferencer.generate(td.DataLoader(zconv, shuffle=False))
    # use original data points for the two extremes
    samples = [samples[0]] + list(gen_samples[1:-1]) + [samples[1]]
    logger.info("interpolation results: ")
    for i, sample in enumerate(samples):
        logger.info(f"interpolation step {i / args.steps:.2%}: \n"
                    f"{formatter.format(sample)}")
    logger.info("saving results...")
    json_dir = save_dir.joinpath("json")
    json_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_json(sample.to_json(), json_dir.joinpath(f"{i:02d}.json"))
    tbl_dir = save_dir.joinpath("table")
    tbl_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_lines([formatter.format(sample)],
                         tbl_dir.joinpath(f"{i:02d}.txt"))
    ltx_dir = save_dir.joinpath("latex")
    ltx_dir.mkdir(exist_ok=True)
    ltx_formatter = utils.DialogICMLLatexFormatter()
    for i, sample in enumerate(samples, 1):
        utils.save_lines([ltx_formatter.format(sample)],
                         ltx_dir.joinpath(f"{i:02d}.tex"))
    logger.info("done!")
def train(args: TrainArguments) -> Record:
    model, device = args.model, args.device
    save_dir = args.save_dir
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    utils.save_json(args.to_json(), str(save_dir.joinpath("args.json")))
    logger = logging.getLogger("train")
    processor = args.processor
    vocabs: datasets.VocabSet = processor.vocabs
    train_dataset = datasets.DialogDataset(
        data=args.train_data,
        processor=processor
    )
    valid_dataset = datasets.DialogDataset(
        data=args.valid_data,
        processor=processor
    )
    logger.info("preparing training environment...")
    loss = create_loss(
        model=model,
        vocabs=vocabs,
        kld_weight=args.kld_schedule,
        enable_kl=not args.disable_kl,
        kl_mode=args.kl_mode
    )
    if args.optimizer == "adam":
        op_cls = op.Adam
    else:
        raise ValueError(f"unsupported optimizer: {args.optimizer}")
    fval_cls = FinegrainedValidator
    fval_kwg = dict(
        model=model,
        processor=processor,
        device=device,
        evaluators=list(filter(None, (
            SpeakerEvaluator(vocabs.speaker),
            DialogStateEvaluator(vocabs),
            PosteriorEvaluator()
        ))),
        report_every=None,
        run_end_report=False,
        progress_stat="loss",
        loss=loss
    )
    if isinstance(model, models.VHDA):
        @dataclass
        class VHDAValidator(VHDAInferencer, fval_cls):
            pass

        fval_cls = VHDAValidator
        fval_kwg.update(dict(sample_scale=1.0))
    fval = fval_cls(**fval_kwg)
    gval_cls = GenerativeValidator
    gval_kwg = dict(
        model=model,
        processor=processor,
        batch_size=args.valid_batch_size,
        device=device,
        evaluators=list(filter(None, [
            DistinctEvaluator(vocabs),
            SentLengthEvaluator(vocabs),
            RougeEvaluator(vocabs),
            DialogLengthEvaluator(),
            WordEntropyEvaluator(train_dataset)
        ])),
        report_every=None,
        run_end_report=False,
        beam_size=args.beam_size,
        max_sent_len=args.max_gen_len
    )
    gval = gval_cls(**gval_kwg)
    trainer_cls = Trainer
    trainer_kwargs = dict(
        model=model,
        processor=processor,
        device=device,
        writer=torch.utils.tensorboard.SummaryWriter(
            log_dir=str(args.save_dir)
        ),
        evaluators=list(filter(None, (
            SpeakerEvaluator(vocabs.speaker),
            DialogStateEvaluator(vocabs)
        ))),
        progress_stat="loss",
        display_stats={"loss", "kld", "goal-acc-turn-user",
                       "rouge-l-f1", "conv-mi", "nll", "conv-len"},
        report_every=args.report_every,
        stats_formatter=utils.StatsFormatter(num_cols=3),
        dialog_formatter=utils.DialogTableFormatter(max_col_len=50),
        loss=loss,
        optimizer_cls=functools.partial(op_cls, lr=args.learning_rate),
        grad_clip=args.gradient_clip,
        l2norm=args.l2norm_weight,
        save_dir=pathlib.Path(args.save_dir),
        num_epochs=args.num_epochs,
        fin_valid=fval,
        gen_valid=gval,
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        save_every=args.save_every
    )
    if isinstance(model, models.VHDA):
        @dataclass
        class VHDATrainer(VHDAInferencer, trainer_cls):
            pass

        trainer_cls = VHDATrainer
        trainer_kwargs.update(dict(dropout_scale=args.dropout_schedule))
    trainer = trainer_cls(**trainer_kwargs)
    train_dataloader = datasets.create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        drop_last=False
    )
    valid_dataloader = datasets.create_dataloader(
        valid_dataset,
        batch_size=args.valid_batch_size,
        shuffle=False,
        pin_memory=True,
        drop_last=False
    )
    logger.info("commencing training...")
    record = trainer.train(train_dataloader, valid_dataloader)
    logger.info(f"final summary: {pprint.pformat(record.to_short_json())}")
    logger.info("done!")
    return record