Ejemplo n.º 1
0
def main(args=None):
    """Evaluate a trained TDA model on the configured dataset splits.

    Loads checkpointed model weights, runs ``evaluate`` once per split in
    ``args.eval_splits``, and writes per-split arguments and results as
    JSON files under ``args.save_dir``.
    """
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    shell = utils.ShellUtils()
    # Refuse to clobber results from an earlier run unless --overwrite.
    if (not args.overwrite and
            save_dir.exists() and utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("evaluate")
    utils.seed(args.seed)
    logger.info("loading data...")
    data_dir = pathlib.Path(args.data_dir)
    # The train split is always loaded in addition to the evaluated splits:
    # it is passed to the evaluator as reference data below.
    data = {
        split: list(map(Dialog.from_json,
                        utils.load_json(data_dir.joinpath(f"{split}.json"))))
        for split in (set(args.eval_splits) | {"train"})
    }
    processor: DialogProcessor = utils.load_pickle(args.processor_path)
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    model.load_state_dict(torch.load(args.ckpt_path))
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    for split in args.eval_splits:
        dialogs = data[split]
        logger.info(f"running evaluation on '{split}' split...")
        # NOTE: "Arugments" is the project class's (misspelled) name.
        eval_args = EvaluateArugments(
            model=model,
            train_data=tuple(data["train"]),
            test_data=tuple(dialogs),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len
        )
        utils.save_json(eval_args.to_json(),
                        save_dir.joinpath(f"eval-{split}-args.json"))
        # Inference only -- no gradients needed.
        with torch.no_grad():
            results = evaluate(eval_args)
        save_path = save_dir.joinpath(f"eval-{split}.json")
        logger.info(f"'{split}' results saved to {save_path}")
        utils.save_json(results, save_path)
    logger.info("done!")
Ejemplo n.º 2
0
 def save(self,
          data: Mapping[str, Sequence[Dialog]],
          path: pathlib.Path,
          overwrite: bool = False):
     """Persist *data* (split name -> dialogs) at *path*.

     Raises ``FileExistsError`` when the target is an existing file or a
     non-empty directory, unless *overwrite* is set.  Any previous content
     is removed before delegating to ``save_imp``.
     """
     # ``path.is_file()`` already implies existence, so the original's
     # extra ``path.exists()`` check was redundant and has been dropped.
     # An existing directory only blocks the save when it has content.
     if (not overwrite
             and (path.is_file()
                  or (path.is_dir()
                      and utils.has_element(path.glob("*"))))):
         raise FileExistsError(f"file exists or directory is "
                               f"not empty: {path}")
     shell = utils.ShellUtils()
     shell.remove(path, recursive=True, silent=True)
     return self.save_imp(data, path)
Ejemplo n.º 3
0
def main():
    """Sample synthetic dialogs from a trained TDA model and dump them.

    Writes the generation arguments, the sampled outputs/inputs, and the
    per-sample log-probabilities into ``args.save_dir``.
    """
    opts = utils.parse_args(create_parser())
    if opts.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(opts.logging_config))
    out_dir = pathlib.Path(opts.save_dir)
    # Guard against overwriting a previous run's JSON artifacts.
    if (not opts.overwrite and
            out_dir.exists() and utils.has_element(out_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({out_dir}) is not empty")
    sh = utils.ShellUtils()
    sh.mkdir(out_dir, silent=True)
    log = logging.getLogger("generate")
    utils.seed(opts.seed)
    log.info("loading data...")
    proc = utils.load_pickle(opts.processor_path)
    # Conditioning data is optional; generation may run without it.
    dialogs = None
    if opts.data_path is not None:
        dialogs = [Dialog.from_json(item)
                   for item in utils.load_json(opts.data_path)]
    log.info("preparing model...")
    torchmodels.register_packages(models)
    tda_cls = torchmodels.create_model_cls(models, opts.model_path)
    tda: models.AbstractTDA = tda_cls(proc.vocabs)
    tda.reset_parameters()
    tda.load_state_dict(torch.load(opts.ckpt_path))
    device = (torch.device("cpu") if opts.gpu is None
              else torch.device(f"cuda:{opts.gpu}"))
    tda = tda.to(device)
    gen_args = GenerateArguments(
        model=tda,
        processor=proc,
        data=dialogs,
        instances=opts.instances,
        batch_size=opts.batch_size,
        conv_scale=opts.conv_scale,
        spkr_scale=opts.spkr_scale,
        goal_scale=opts.goal_scale,
        state_scale=opts.state_scale,
        sent_scale=opts.sent_scale,
        validate_dst=opts.validate_dst,
        validate_unique=opts.validate_unique,
        device=device
    )
    utils.save_json(gen_args.to_json(), out_dir.joinpath("args.json"))
    # Sampling only -- gradients are never needed here.
    with torch.no_grad():
        samples = generate(gen_args)
    utils.save_json([s.output.to_json() for s in samples],
                    out_dir.joinpath("gen-out.json"))
    utils.save_json([s.input.to_json() for s in samples],
                    out_dir.joinpath("gen-in.json"))
    utils.save_lines([str(s.log_prob) for s in samples],
                     out_dir.joinpath("logprob.txt"))
    log.info("done!")
Ejemplo n.º 4
0
Archivo: run.py Proyecto: kaniblu/vhda
 def __post_init__(self):
     """Build derived runtime attributes after dataclass construction."""
     self._logger = logging.getLogger(self.__class__.__name__)
     # Per-speaker state-vocab tensors, moved onto the target device.
     self._user_tensor = (self.processor
                          .tensorize_state_vocab(speaker="user")
                          .to(self.device))
     self._wizard_tensor = (self.processor
                            .tensorize_state_vocab(speaker="wizard")
                            .to(self.device))
     # Element-wise BCE; reduction happens at the call sites.
     self._bce = nn.BCEWithLogitsLoss(reduction="none")
     utils.ShellUtils().mkdir(self.save_dir, True)
     assert self.asr_sigmoid_sum_order in {"sigmoid-sum", "sum-sigmoid"}
Ejemplo n.º 5
0
Archivo: glad.py Proyecto: kaniblu/vhda
def main(args):
    """Run the GLAD dialog-state tracker end to end.

    Loads train/valid/test splits in one of three formats ("woz", "json",
    "dstc"), trains and evaluates GLAD, and saves ``pred.json`` and
    ``eval.json`` under ``args.save_dir``.
    """
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    logger = logging.getLogger("GladRunner")
    save_dir = pathlib.Path(args.save_dir)
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger.info("running glad...")
    if args.data_format == "woz":
        load_fn = load_woz
    elif args.data_format == "json":
        load_fn = utils.chain_func(
            lambda x: list(map(Dialog.from_json, x)),
            utils.load_json
        )
    elif args.data_format == "dstc":
        load_fn = load_dstc2
    else:
        # BUG FIX: this branch previously referenced the non-existent
        # ``args.data_type``, raising AttributeError instead of the
        # intended ValueError.
        raise ValueError(f"unsupported data type: {args.data_format}")
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    # Some datasets ship the validation split as "dev.json" instead.
    valid_data = load_fn(str(try_path(data_dir.joinpath("valid.json")) or
                             data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    glad = Glad(
        train_data=train_data,
        valid_data=valid_data,
        test_data=test_data,
        glad_dir=args.glad_dir,
        epochs=args.max_epochs,
        batch_size=args.batch_size,
        emb_dropout=args.emb_dropout,
        local_dropout=args.local_dropout,
        global_dropout=args.global_dropout,
        save_dir=save_dir.joinpath("exp"),
        seed=args.seed,
        early_stop_criterion=args.early_stop_criterion,
        gpu=args.gpu
    )
    pred, res = glad.run_all()
    logger.info("saving results...")
    utils.save_json(pred, str(save_dir.joinpath("pred.json")))
    utils.save_json(res, str(save_dir.joinpath("eval.json")))
    logger.info("done!")
Ejemplo n.º 6
0
def multi_glad(args: MultiGladArguments) -> dict:
    """Run GLAD ``args.runs`` times and aggregate the evaluation results.

    Each trial is saved under ``trial-NN/`` in ``args.save_dir``; the
    aggregated statistics are written to ``eval-summary.json`` and
    returned.
    """
    save_dir = args.save_dir
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("MultiGlad")
    engine = inflect.engine()
    for i in range(1, args.runs + 1):
        logger.info(f"[{i:02d}] running glad {engine.ordinal(i)} time...")
        trial_dir = save_dir.joinpath(f"trial-{i:02d}")
        shell.mkdir(trial_dir, silent=True)
        glad = Glad(
            train_data=args.train_data,
            valid_data=args.valid_data,
            test_data=args.test_data,
            glad_dir=args.glad_dir,
            save_dir=trial_dir.joinpath("exp"),
            epochs=args.max_epochs,
            batch_size=args.batch_size,
            emb_dropout=args.emb_dropout,
            local_dropout=args.local_dropout,
            global_dropout=args.global_dropout,
            early_stop_criterion=args.early_stop_criterion,
            gpu=args.gpu
        )
        pred, res = glad.run_all()
        logger.info(f"[{i:02d}] saving results...")
        utils.save_json(pred, str(trial_dir.joinpath("pred.json")))
        utils.save_json(res, str(trial_dir.joinpath("eval.json")))
        logger.info(f"[{i:02d}] {engine.ordinal(i)} glad run finished.")
    logger.info("aggregating results...")
    result_paths = (str(save_dir.joinpath(f"trial-{i:02d}/eval.json"))
                    for i in range(1, args.runs + 1))
    # CONSISTENCY FIX: the eval files are written with save_json above, so
    # read them back with load_json (previously load_yaml was used).
    results = list(map(utils.load_json, result_paths))
    summary = reduce_json(results)
    utils.save_json(summary, str(save_dir.joinpath("eval-summary.json")))
    # str() for consistency with the other save_json calls in this function
    utils.save_json(args.to_json(), str(save_dir.joinpath("args.json")))
    logger.info("done!")
    return summary
Ejemplo n.º 7
0
def optimize(trial: optuna.Trial, model_path, config_path):
    """Optuna objective: run one ``run.py`` experiment with trial-sampled
    hyperparameters and return the (negated) harmonic-mean t statistic.

    The trial-modified run/model configs are written to temp files, the
    experiment is launched as a subprocess, and the t-test results are
    read back from the experiment's generation directory.

    Raises RuntimeError when the subprocess fails or produces no
    generation directory.
    """
    optimizer = Optimizer(trial)
    run_config = utils.load_yaml(config_path)
    mdl_config = utils.load_yaml(model_path)
    run_config = optimizer.optimize_config(run_config)
    mdl_config = optimizer.optimize_model(mdl_config)
    shell = utils.ShellUtils()
    # Debug copies of the trial's configs, for post-mortem inspection.
    shell.mkdir("optimize-debug", silent=True)
    utils.save_yaml(mdl_config, "optimize-debug/model.yml")
    utils.save_json(run_config, "optimize-debug/run.json")
    # SECURITY/ROBUSTNESS FIX: tempfile.mktemp() is deprecated and
    # race-prone; NamedTemporaryFile(delete=False) creates the files
    # atomically.  Only the paths are needed afterwards.
    with tempfile.NamedTemporaryFile(delete=False) as run_file, \
            tempfile.NamedTemporaryFile(delete=False) as mdl_file:
        run_path, mdl_path = run_file.name, mdl_file.name
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_dir = (pathlib.Path(__file__).absolute().parent.joinpath(
        f"out/woz/{timestamp}"))
    run_config["save-dir"] = str(save_dir)
    run_config["model-path"] = mdl_path
    utils.save_json(run_config, run_path)
    utils.save_json(mdl_config, mdl_path)
    retcode, stdout, stderr = utils.Process(
        args=f"python run.py @load {run_path}".split(),
        cwd=pathlib.Path(__file__).absolute().parent,
        print_stdout=True,
        print_stderr=True).run()
    if retcode:
        raise RuntimeError(f"process 'run.py' failed; "
                           f"return code: {retcode}; stderr: {stderr}")
    shell.remove(run_path, silent=True)
    shell.remove(mdl_path, silent=True)
    gen_dirs = list(save_dir.glob("gen-*"))
    if not gen_dirs:
        raise RuntimeError("no generation directory detected")
    if len(gen_dirs) > 1:
        warnings.warn(f"more than 1 generation "
                      f"directories detected: {gen_dirs}")
    # Use the latest generation directory when several exist.
    gen_dir = gen_dirs[-1]
    ttest_results = utils.load_json(gen_dir.joinpath("ttest-results.json"))
    # Optuna minimizes, so return the negative t statistic.
    return -ttest_results["hmean"]["t"]
Ejemplo n.º 8
0
def main():
    """Full pipeline: train a generative dialog model, evaluate it, then
    measure downstream DST performance on augmented data.

    Stages:
      1. load splits, fit the ``DialogProcessor``, train the generator
      2. evaluate the generator on the requested splits
      3. for each of ``args.gen_runs`` generation trials: sample synthetic
         dialogs, augment the training split, and run ``args.dst_runs``
         DST training/evaluation trials on the augmented data
      4. aggregate per-trial results into summary JSON files
    """
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    # Refuse to clobber a previous run unless --overwrite is given.
    if (not args.overwrite and save_dir.exists()
            and utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("train")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda x: list(map(Dialog.from_json, x)),
                               utils.load_json)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    processor = datasets.DialogProcessor(
        sent_processor=datasets.SentProcessor(
            bos=True, eos=True, lowercase=True,
            tokenizer="space", max_len=30),
        boc=True,
        eoc=True,
        state_order="randomized",
        max_len=30)
    # Vocabs are built over all splits so evaluation never hits OOV ids.
    processor.prepare_vocabs(
        list(itertools.chain(train_data, valid_data, test_data)))
    utils.save_pickle(processor, save_dir.joinpath("processor.pkl"))
    logger.info("preparing model...")
    utils.save_json(utils.load_yaml(args.gen_model_path),
                    save_dir.joinpath("model.json"))
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    utils.report_model(logger, model)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)

    def create_scheduler(s):
        # NOTE: eval() on a CLI-provided expression -- acceptable only
        # because the input is a trusted local experiment config.
        return utils.PiecewiseScheduler(
            [utils.Coordinate(*t) for t in eval(s)])

    save_dir = pathlib.Path(args.save_dir)
    train_args = train.TrainArguments(
        model=model,
        train_data=tuple(train_data),
        valid_data=tuple(valid_data),
        processor=processor,
        device=device,
        save_dir=save_dir,
        report_every=args.report_every,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        optimizer=args.optimizer,
        gradient_clip=args.gradient_clip,
        l2norm_weight=args.l2norm_weight,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        kld_schedule=(utils.ConstantScheduler(1.0) if args.kld_schedule is None
                      else create_scheduler(args.kld_schedule)),
        dropout_schedule=(utils.ConstantScheduler(1.0)
                          if args.dropout_schedule is None else
                          create_scheduler(args.dropout_schedule)),
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        disable_kl=args.disable_kl,
        kl_mode=args.kl_mode)
    utils.save_json(train_args.to_json(), save_dir.joinpath("train-args.json"))
    record = train.train(train_args)
    utils.save_json(record.to_json(), save_dir.joinpath("final-summary.json"))
    eval_dir = save_dir.joinpath("eval")
    shell.mkdir(eval_dir, silent=True)
    # Keep only the splits requested on the command line, in order.
    eval_data = dict(
        list(
            filter(None, [
                ("train", train_data) if "train" in args.eval_splits else None,
                ("dev", valid_data) if "dev" in args.eval_splits else None,
                ("test", test_data) if "test" in args.eval_splits else None
            ])))
    for split, data in eval_data.items():
        eval_args = evaluate.EvaluateArugments(
            model=model,
            train_data=tuple(train_data),
            test_data=tuple(data),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.valid_batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len)
        utils.save_json(eval_args.to_json(),
                        eval_dir.joinpath(f"eval-{split}-args.json"))
        # FIX: evaluate without autograd (consistent with the generation
        # step below); gradients are never needed here and only cost memory.
        with torch.no_grad():
            eval_results = evaluate.evaluate(eval_args)
        save_path = eval_dir.joinpath(f"eval-{split}.json")
        utils.save_json(eval_results, save_path)
        logger.info(f"'{split}' results saved to {save_path}")
    logger.info(f"will run {args.gen_runs} generation trials...")
    gen_summary = []
    dst_summary = []
    for gen_idx in range(1, args.gen_runs + 1):
        logger.info(f"running {engine.ordinal(gen_idx)} generation trial...")
        gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}")
        shell.mkdir(gen_dir, silent=True)
        gen_args = generate.GenerateArguments(
            model=model,
            processor=processor,
            data=train_data,
            instances=int(round(len(train_data) * args.multiplier)),
            batch_size=args.valid_batch_size,
            conv_scale=args.conv_scale,
            spkr_scale=args.spkr_scale,
            goal_scale=args.goal_scale,
            state_scale=args.state_scale,
            sent_scale=args.sent_scale,
            validate_dst=True,
            validate_unique=args.validate_unique,
            device=device)
        utils.save_json(gen_args.to_json(), gen_dir.joinpath("gen-args.json"))
        with torch.no_grad():
            samples = generate.generate(gen_args)
        utils.save_json([sample.output.to_json() for sample in samples],
                        gen_dir.joinpath("gen-out.json"))
        utils.save_json([sample.input.to_json() for sample in samples],
                        gen_dir.joinpath("gen-in.json"))
        utils.save_lines([str(sample.log_prob) for sample in samples],
                         gen_dir.joinpath("logprob.txt"))
        da_data = [sample.output for sample in samples]
        # BUG FIX: the original used ``data["train"] += da_data``, which
        # mutated ``train_data`` in place so synthetic dialogs accumulated
        # across generation trials; concatenation keeps each trial's
        # augmentation independent.
        data = {
            "train": train_data + da_data,
            "dev": valid_data,
            "test": test_data
        }
        # convert dialogs to dst dialogs
        data = {
            split: list(map(datasets.DSTDialog.from_dialog, dialogs))
            for split, dialogs in data.items()
        }
        for split, dialogs in data.items():
            logger.info(f"verifying '{split}' dataset...")
            for dialog in dialogs:
                dialog.compute_user_goals()
                dialog.validate()
        logger.info("preparing dst environment...")
        dst_processor = dst_datasets.DSTDialogProcessor(
            sent_processor=datasets.SentProcessor(
                bos=True, eos=True, lowercase=True, max_len=30))
        dst_processor.prepare_vocabs(list(itertools.chain(*data.values())))
        train_dataset = dst_datasets.DSTDialogDataset(dialogs=data["train"],
                                                      processor=dst_processor)
        train_dataloader = dst_datasets.create_dataloader(
            train_dataset,
            batch_size=args.dst_batch_size,
            shuffle=True,
            pin_memory=True)
        dev_dataloader = dst_run.TestDataloader(
            dialogs=data["dev"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        test_dataloader = dst_run.TestDataloader(
            dialogs=data["test"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        logger.info("saving dst processor object...")
        utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl"))
        torchmodels.register_packages(dst_models)
        dst_model_cls = torchmodels.create_model_cls(dst_pkg,
                                                     args.dst_model_path)
        dst_model = dst_model_cls(dst_processor.vocabs)
        dst_model = dst_model.to(device)
        # BUG FIX: log the DST model that was just built (previously the
        # generative ``model`` was logged here by mistake).
        logger.info(str(dst_model))
        logger.info(f"number of parameters DST: "
                    f"{utils.count_parameters(dst_model):,d}")
        logger.info(f"running {args.dst_runs} trials...")
        all_results = []
        for idx in range(1, args.dst_runs + 1):
            logger.info(f"running {engine.ordinal(idx)} dst trial...")
            trial_dir = gen_dir.joinpath(f"dst-{idx:03d}")
            logger.info("resetting parameters...")
            dst_model.reset_parameters()
            logger.info("preparing trainer...")
            runner = dst_run.Runner(
                model=dst_model,
                processor=dst_processor,
                device=device,
                save_dir=trial_dir,
                # scale epochs down since the training set grew by the
                # augmentation multiplier
                epochs=int(round(args.dst_epochs / (1 + args.multiplier))),
                loss="sum",
                l2norm=args.dst_l2norm,
                gradient_clip=args.dst_gradient_clip,
                train_validate=False,
                early_stop=True,
                early_stop_criterion="joint-goal",
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5)

            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            if not args.dst_test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            eval_results["epoch"] = int(record.epoch)
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        # BUG FIX: previously a pformat *string* was appended to
        # ``gen_summary`` (and logged through pformat twice); ``reduce_json``
        # aggregates dicts, so keep the means dict and format only for
        # logging.
        agg_means = {k: v["stats"]["mean"] for k, v in summary.items()}
        logger.info(pprint.pformat(agg_means))
        gen_summary.append(agg_means)
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"]
                        for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials ({args.gen_runs * args.dst_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"]
                        for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")
Ejemplo n.º 9
0
def main():
    """Train a generative dialog model and evaluate it on the requested
    splits.

    Fits a ``DialogProcessor`` over all splits, trains the model via
    ``train.train``, then runs ``evaluate.evaluate`` for each split in
    ``args.eval_splits``, saving every artifact under ``args.save_dir``.
    """
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    # Refuse to clobber results from an earlier run unless --overwrite.
    if (not args.overwrite and
            save_dir.exists() and utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("train")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda x: list(map(Dialog.from_json, x)),
                               utils.load_json)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    processor = datasets.DialogProcessor(
        sent_processor=datasets.SentProcessor(
            bos=True,
            eos=True,
            lowercase=True,
            tokenizer="space",
            max_len=30
        ),
        boc=True,
        eoc=True,
        state_order="randomized",
        max_len=30
    )
    # Vocabs are built over all splits so evaluation never hits OOV ids.
    processor.prepare_vocabs(
        list(itertools.chain(train_data, valid_data, test_data)))
    utils.save_pickle(processor, save_dir.joinpath("processor.pkl"))
    logger.info("preparing model...")
    utils.save_json(utils.load_yaml(args.model_path),
                    save_dir.joinpath("model.json"))
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    utils.report_model(logger, model)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)

    def create_scheduler(s):
        # NOTE(review): eval() on a CLI-provided expression -- assumed to
        # be trusted local experiment input; confirm before exposing.
        return utils.PiecewiseScheduler([utils.Coordinate(*t) for t in eval(s)])

    save_dir = pathlib.Path(args.save_dir)
    train_args = train.TrainArguments(
        model=model,
        train_data=tuple(train_data),
        valid_data=tuple(valid_data),
        processor=processor,
        device=device,
        save_dir=save_dir,
        report_every=args.report_every,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        optimizer=args.optimizer,
        gradient_clip=args.gradient_clip,
        l2norm_weight=args.l2norm_weight,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        # Constant schedules are the defaults when no spec string is given.
        kld_schedule=(utils.ConstantScheduler(1.0)
                      if args.kld_schedule is None else
                      create_scheduler(args.kld_schedule)),
        dropout_schedule=(utils.ConstantScheduler(1.0)
                          if args.dropout_schedule is None else
                          create_scheduler(args.dropout_schedule)),
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        disable_kl=args.disable_kl,
        kl_mode=args.kl_mode
    )
    utils.save_json(train_args.to_json(), save_dir.joinpath("train-args.json"))
    record = train.train(train_args)
    utils.save_json(record.to_json(), save_dir.joinpath("final-summary.json"))
    eval_dir = save_dir.joinpath("eval")
    shell.mkdir(eval_dir, silent=True)
    # Keep only the splits requested on the command line, in order.
    eval_data = dict(list(filter(None, [
        ("train", train_data) if "train" in args.eval_splits else None,
        ("dev", valid_data) if "dev" in args.eval_splits else None,
        ("test", test_data) if "test" in args.eval_splits else None
    ])))
    for split, data in eval_data.items():
        eval_args = evaluate.EvaluateArugments(
            model=model,
            train_data=tuple(train_data),
            test_data=tuple(data),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.valid_batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len
        )
        utils.save_json(eval_args.to_json(),
                        eval_dir.joinpath(f"eval-{split}-args.json"))
        # Inference only -- no gradients needed.
        with torch.no_grad():
            eval_results = evaluate.evaluate(eval_args)
        save_path = eval_dir.joinpath(f"eval-{split}.json")
        utils.save_json(eval_results, save_path)
        logger.info(f"'{split}' results saved to {save_path}")
    logger.info("done!")
Ejemplo n.º 10
0
Archivo: gda.py Proyecto: kaniblu/vhda
def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists()
            and utils.has_element(save_dir.glob("*"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("gda")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda data: list(map(Dialog.from_json, data)),
                               utils.load_json)
    processor = utils.load_pickle(args.processor_path)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    data = {"train": train_data, "dev": valid_data, "test": test_data}
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    ckpt = torch.load(args.ckpt_path)
    model.load_state_dict(ckpt)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    logger.info(f"will run {args.gen_runs} generation trials...")
    gen_summary = []
    dst_summary = []
    for gen_idx in range(1, args.gen_runs + 1):
        logger.info(f"running {engine.ordinal(gen_idx)} generation trial...")
        gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}")
        shell.mkdir(gen_dir, silent=True)
        gen_args = generate.GenerateArguments(
            model=model,
            processor=processor,
            data=tuple(train_data),
            instances=int(round(len(train_data) * args.multiplier)),
            batch_size=args.gen_batch_size,
            conv_scale=args.conv_scale,
            spkr_scale=args.spkr_scale,
            goal_scale=args.goal_scale,
            state_scale=args.state_scale,
            sent_scale=args.sent_scale,
            validate_dst=True,
            validate_unique=args.validate_unique,
            device=device)
        utils.save_json(gen_args.to_json(), gen_dir.joinpath("args.json"))
        # NOTE(review): this span is the tail of a larger function — the
        # enclosing `def` and loop headers are outside this chunk, so names
        # like gen_args, gen_dir, data, device, gen_summary and dst_summary
        # are bound earlier, above this fragment.
        # Sample augmented dialogs from the trained generator (no gradients).
        with torch.no_grad():
            samples = generate.generate(gen_args)
        # Persist generation artifacts: decoded outputs, inputs, log-probs.
        utils.save_json([sample.output.to_json() for sample in samples],
                        gen_dir.joinpath("out.json"))
        utils.save_json([sample.input.to_json() for sample in samples],
                        gen_dir.joinpath("in.json"))
        utils.save_lines([str(sample.log_prob) for sample in samples],
                         gen_dir.joinpath("logprob.txt"))
        # Data augmentation: only the training split gets the generated dialogs.
        da_data = [sample.output for sample in samples]
        gen_data = {
            "train": data["train"] + da_data,
            "dev": data["dev"],
            "test": data["test"]
        }
        # convert dialogs to dst dialogs
        gen_data = {
            split: list(map(datasets.DSTDialog.from_dialog, dialogs))
            for split, dialogs in gen_data.items()
        }
        # Derive user goals and validate every dialog before DST training.
        for split, dialogs in gen_data.items():
            logger.info(f"verifying '{split}' dataset...")
            for dialog in dialogs:
                dialog.compute_user_goals()
                dialog.validate()
        logger.info("preparing dst environment...")
        dst_processor = dst_datasets.DSTDialogProcessor(
            sent_processor=datasets.SentProcessor(
                bos=True, eos=True, lowercase=True, max_len=30))
        # Vocabularies are built over all splits (train + dev + test).
        dst_processor.prepare_vocabs(list(itertools.chain(*gen_data.values())))
        train_dataset = dst_datasets.DSTDialogDataset(
            dialogs=gen_data["train"], processor=dst_processor)
        train_dataloader = dst_datasets.create_dataloader(
            train_dataset,
            batch_size=args.dst_batch_size,
            shuffle=True,
            pin_memory=True)
        dev_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["dev"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        test_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["test"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        logger.info("saving dst processor object...")
        utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl"))
        torchmodels.register_packages(dst_models)
        dst_model_cls = torchmodels.create_model_cls(dst_pkg,
                                                     args.dst_model_path)
        dst_model = dst_model_cls(dst_processor.vocabs)
        dst_model = dst_model.to(device)
        # NOTE(review): logs `model` (the generator), not `dst_model` —
        # possibly intentional, but worth confirming.
        logger.info(str(model))
        logger.info(f"number of parameters DST: "
                    f"{utils.count_parameters(dst_model):,d}")
        logger.info(f"will run {args.dst_runs} trials...")
        all_results = []
        # Re-train the DST model from scratch several times to average out
        # run-to-run variance; each trial writes to its own sub-directory.
        for idx in range(1, args.dst_runs + 1):
            logger.info(f"running {engine.ordinal(idx)} dst trial...")
            trial_dir = gen_dir.joinpath(f"dst-{idx:03d}")
            logger.info("resetting parameters...")
            dst_model.reset_parameters()
            logger.info("preparing trainer...")
            # Epoch budget shrinks as the augmentation multiplier grows, so
            # total gradient steps stay roughly constant across settings.
            runner = dst_run.Runner(
                model=dst_model,
                processor=dst_processor,
                device=device,
                save_dir=trial_dir,
                epochs=int(round(args.epochs / (1 + args.multiplier))),
                loss="sum",
                l2norm=args.l2norm,
                gradient_clip=args.gradient_clip,
                train_validate=False,
                early_stop=True,
                early_stop_criterion="joint-goal",
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5)

            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            # Evaluate either on clean transcripts or on ASR hypotheses.
            if not args.test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            # Record which epoch early stopping picked and its criterion value.
            eval_results["epoch"] = int(record.epoch)
            eval_results["criterion"] = record.value
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        # Aggregate per-trial metrics into summary statistics for this run.
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        agg_results = {k: v["stats"]["mean"] for k, v in summary.items()}
        gen_summary.append(agg_results)
        logger.info(pprint.pformat(agg_results))
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    # Final aggregation across all generation runs and all DST trials.
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"]
                        for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials ({args.gen_runs * args.dst_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"]
                        for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")
Ejemplo n.º 11
0
Archivo: woz.py Proyecto: kaniblu/vhda
 def save_imp(self, dat: Mapping[str, Sequence[Dialog]],
              path: pathlib.Path):
     """Write the three canonical dataset splits of `dat` as JSON files under `path`."""
     utils.ShellUtils().mkdir(path, True)
     splits = ("train", "dev", "test")
     for name in splits:
         self.save_json(dat[name], path.joinpath(f"{name}.json"))
Ejemplo n.º 12
0
def main(args=None):
    """Interpolate between two dialogs in a trained model's latent space.

    Encodes two anchor dialogs, linearly interpolates their latent codes over
    ``args.steps`` steps, decodes each intermediate point back into a dialog,
    and saves the results as JSON, plain-text tables and LaTeX tables under
    ``args.save_dir``.
    """
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    # Refuse to clobber previous results unless overwriting is allowed.
    if (not args.overwrite and save_dir.exists()
            and utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("interpolate")
    data_dir = pathlib.Path(args.data_dir)
    # Load each requested split as a list of Dialog objects.
    data = {
        split: list(
            map(Dialog.from_json,
                utils.load_json(data_dir.joinpath(f"{split}.json"))))
        for split in set(args.splits)
    }
    processor: DialogProcessor = utils.load_pickle(args.processor_path)
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    # Restore trained weights from the checkpoint.
    model.load_state_dict(torch.load(args.ckpt_path))
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    # Pick the two anchor dialogs to interpolate between.
    samples = (sample_data(data,
                           args.anchor1), sample_data(data, args.anchor2))
    formatter = utils.DialogTableFormatter()
    logger.info(f"first sample: \n{formatter.format(samples[0])}")
    logger.info(f"second sample: \n{formatter.format(samples[1])}")
    logger.info("preparing environment...")
    dataloader = datasets.create_dataloader(dataset=datasets.DialogDataset(
        data=samples, processor=processor),
                                            batch_size=1,
                                            shuffle=False,
                                            pin_memory=False)
    inferencer = InterpolateInferencer(model=model,
                                       processor=processor,
                                       device=device)
    logger.info("interpolating...")
    with torch.no_grad():
        # Encode both anchors, then build steps+1 evenly-spaced latent points
        # on the segment from zconv_a to zconv_b (both endpoints included).
        zconv_a, zconv_b = inferencer.encode(dataloader)
        zconv = torch.stack([
            zconv_a + (zconv_b - zconv_a) / args.steps * i
            for i in range(args.steps + 1)
        ])
        gen_samples = inferencer.generate(td.DataLoader(zconv, shuffle=False))
    # use original data points for two extremes
    samples = [samples[0]] + list(gen_samples[1:-1]) + [samples[1]]
    logger.info("interpolation results: ")
    for i, sample in enumerate(samples):
        logger.info(f"interpolation step {i / args.steps:.2%}: \n"
                    f"{formatter.format(sample)}")
    logger.info("saving results...")
    # Save each interpolated dialog in three formats: JSON, table, LaTeX.
    json_dir = save_dir.joinpath("json")
    json_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_json(sample.to_json(), json_dir.joinpath(f"{i:02d}.json"))
    tbl_dir = save_dir.joinpath("table")
    tbl_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_lines([formatter.format(sample)],
                         tbl_dir.joinpath(f"{i:02d}.txt"))
    ltx_dir = save_dir.joinpath("latex")
    ltx_dir.mkdir(exist_ok=True)
    ltx_formatter = utils.DialogICMLLatexFormatter()
    for i, sample in enumerate(samples, 1):
        utils.save_lines([ltx_formatter.format(sample)],
                         ltx_dir.joinpath(f"{i:02d}.tex"))
    logger.info("done!")
Ejemplo n.º 13
0
def train(args: TrainArguments) -> Record:
    """Train a dialog model end-to-end and return the final training record.

    Builds the train/valid datasets from ``args``, wires up the loss, the two
    validators (fine-grained and generative) and the Trainer, runs the
    training loop, and returns its summary ``Record``.  Artifacts (args.json,
    tensorboard logs, checkpoints) are written under ``args.save_dir``.
    """
    model, device = args.model, args.device
    save_dir = args.save_dir
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    # Snapshot the full argument set for reproducibility.
    utils.save_json(args.to_json(), str(save_dir.joinpath("args.json")))
    logger = logging.getLogger("train")
    processor = args.processor
    vocabs: datasets.VocabSet = processor.vocabs
    train_dataset = datasets.DialogDataset(
        data=args.train_data,
        processor=processor
    )
    valid_dataset = datasets.DialogDataset(
        data=args.valid_data,
        processor=processor
    )
    logger.info("preparing training environment...")
    loss = create_loss(
        model=model,
        vocabs=vocabs,
        kld_weight=args.kld_schedule,
        enable_kl=not args.disable_kl,
        kl_mode=args.kl_mode
    )
    # Only Adam is supported at the moment.
    if args.optimizer == "adam":
        op_cls = op.Adam
    else:
        raise ValueError(f"unsupported optimizer: {args.optimizer}")
    # Fine-grained validator: evaluation metrics on the validation split.
    fval_cls = FinegrainedValidator
    fval_kwg = dict(
        model=model,
        processor=processor,
        device=device,
        evaluators=list(filter(None, (
            SpeakerEvaluator(vocabs.speaker),
            DialogStateEvaluator(vocabs),
            PosteriorEvaluator()
        ))),
        report_every=None,
        run_end_report=False,
        progress_stat="loss",
        loss=loss
    )
    if isinstance(model, models.VHDA):
        # VHDA models need the VHDAInferencer mixin; create the combined
        # class dynamically so other models keep the plain validator.
        @dataclass
        class VHDAValidator(VHDAInferencer, fval_cls):
            pass

        fval_cls = VHDAValidator
        fval_kwg.update(dict(
            sample_scale=1.0
        ))
    fval = fval_cls(**fval_kwg)

    # Generative validator: metrics on free-running generation
    # (diversity, sentence length, ROUGE, dialog length, word entropy).
    gval_cls = GenerativeValidator
    gval_kwg = dict(
        model=model,
        processor=processor,
        batch_size=args.valid_batch_size,
        device=device,
        evaluators=list(filter(None, [
            DistinctEvaluator(vocabs),
            SentLengthEvaluator(vocabs),
            RougeEvaluator(vocabs),
            DialogLengthEvaluator(),
            WordEntropyEvaluator(train_dataset)
        ])),
        report_every=None,
        run_end_report=False,
        beam_size=args.beam_size,
        max_sent_len=args.max_gen_len
    )
    gval = gval_cls(**gval_kwg)

    trainer_cls = Trainer
    trainer_kwargs = dict(
        model=model,
        processor=processor,
        device=device,
        writer=torch.utils.tensorboard.SummaryWriter(
            log_dir=str(args.save_dir)
        ),
        evaluators=list(filter(None, (
            SpeakerEvaluator(vocabs.speaker),
            DialogStateEvaluator(vocabs)
        ))),
        progress_stat="loss",
        display_stats={"loss", "kld", "goal-acc-turn-user",
                       "rouge-l-f1", "conv-mi", "nll", "conv-len"},
        report_every=args.report_every,
        stats_formatter=utils.StatsFormatter(num_cols=3),
        dialog_formatter=utils.DialogTableFormatter(
            max_col_len=50
        ),
        loss=loss,
        optimizer_cls=functools.partial(
            op_cls,
            lr=args.learning_rate
        ),
        grad_clip=args.gradient_clip,
        l2norm=args.l2norm_weight,
        save_dir=pathlib.Path(args.save_dir),
        num_epochs=args.num_epochs,
        fin_valid=fval,
        gen_valid=gval,
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        save_every=args.save_every
    )
    if isinstance(model, models.VHDA):
        # Same mixin treatment for the trainer as for the validator above.
        @dataclass
        class VHDATrainer(VHDAInferencer, trainer_cls):
            pass

        trainer_cls = VHDATrainer
        trainer_kwargs.update(dict(
            dropout_scale=args.dropout_schedule
        ))
    trainer = trainer_cls(**trainer_kwargs)
    train_dataloader = datasets.create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        drop_last=False
    )
    valid_dataloader = datasets.create_dataloader(
        valid_dataset,
        batch_size=args.valid_batch_size,
        shuffle=False,
        pin_memory=True,
        drop_last=False
    )
    logger.info("commencing training...")
    record = trainer.train(train_dataloader, valid_dataloader)
    logger.info(f"final summary: {pprint.pformat(record.to_short_json())}")
    logger.info("done!")
    return record