Code example #1
    def train(self, config, **kwargs):
        """Trains a model on the given configurations.
        :param config: A training configuration. Note that all parameters in the config can also be manually adjusted with --ARG=VALUE
        :param **kwargs: parameters to overwrite yaml config
        """
        from pycocoevalcap.cider.cider import Cider

        config_parameters = train_util.parse_config_or_kwargs(config, **kwargs)
        config_parameters["seed"] = self.seed
        outputdir = os.path.join(
            config_parameters["outputpath"], config_parameters["model"],
            "{}_{}".format(
                datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                uuid.uuid1().hex))

        # Early init because of creating dir
        checkpoint_handler = ModelCheckpoint(
            outputdir,
            "run",
            n_saved=1,
            require_empty=False,
            create_dir=True,
            score_function=lambda engine: engine.state.metrics["score"],
            score_name="score")

        logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
        # print passed config parameters
        logger.info("Storing files in: {}".format(outputdir))
        train_util.pprint_dict(config_parameters, logger.info)

        zh = config_parameters["zh"]
        vocabulary = torch.load(config_parameters["vocab_file"])
        train_loader, cv_loader, info = self._get_dataloaders(
            config_parameters, vocabulary)
        config_parameters["inputdim"] = info["inputdim"]
        cv_key2refs = info["cv_key2refs"]
        logger.info("<== Estimating Scaler ({}) ==>".format(
            info["scaler"].__class__.__name__))
        logger.info("Feature: {} Input dimension: {} Vocab Size: {}".format(
            config_parameters["feature_file"], info["inputdim"],
            len(vocabulary)))

        model = self._get_model(config_parameters, len(vocabulary))
        if "pretrained_word_embedding" in config_parameters:
            embeddings = np.load(
                config_parameters["pretrained_word_embedding"])
            model.load_word_embeddings(
                embeddings,
                tune=config_parameters["tune_word_embedding"],
                projection=True)
        model = model.to(self.device)
        train_util.pprint_dict(model, logger.info, formatter="pretty")
        optimizer = getattr(torch.optim, config_parameters["optimizer"])(
            model.parameters(), **config_parameters["optimizer_args"])
        train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

        criterion = torch.nn.CrossEntropyLoss().to(self.device)
        crtrn_imprvd = train_util.criterion_improver(
            config_parameters['improvecriterion'])

        def _train_batch(engine, batch):
            model.train()
            with torch.enable_grad():
                optimizer.zero_grad()
                output = self._forward(model, batch, "train")
                loss = criterion(output["packed_logits"],
                                 output["targets"]).to(self.device)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                output["loss"] = loss.item()
                return output

        trainer = Engine(_train_batch)
        RunningAverage(output_transform=lambda x: x["loss"]).attach(
            trainer, "running_loss")
        pbar = ProgressBar(persist=False, ascii=True, ncols=100)
        pbar.attach(trainer, ["running_loss"])

        key2pred = {}

        def _inference(engine, batch):
            model.eval()
            keys = batch[2]
            with torch.no_grad():
                output = self._forward(model, batch, "validation")
                seqs = output["seqs"].cpu().numpy()
                for (idx, seq) in enumerate(seqs):
                    if keys[idx] in key2pred:
                        continue
                    candidate = self._convert_idx2sentence(seq, vocabulary, zh)
                    key2pred[keys[idx]] = [
                        candidate,
                    ]
                return output

        metrics = {
            "loss":
            Loss(criterion,
                 output_transform=lambda x: (x["packed_logits"], x["targets"]))
        }

        evaluator = Engine(_inference)

        def eval_cv(engine, key2pred, key2refs):
            scorer = Cider(zh=zh)
            score, scores = scorer.compute_score(key2refs, key2pred)
            engine.state.metrics["score"] = score
            key2pred.clear()

        evaluator.add_event_handler(Events.EPOCH_COMPLETED, eval_cv, key2pred,
                                    cv_key2refs)

        for name, metric in metrics.items():
            metric.attach(evaluator, name)

        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  train_util.log_results, evaluator, cv_loader,
                                  logger.info, ["loss", "score"])

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, train_util.save_model_on_improved,
            crtrn_imprvd, "score", {
                "model": model.state_dict(),
                "config": config_parameters,
                "scaler": info["scaler"]
            }, os.path.join(outputdir, "saved.pth"))

        scheduler = getattr(torch.optim.lr_scheduler,
                            config_parameters["scheduler"])(
                                optimizer,
                                **config_parameters["scheduler_args"])
        evaluator.add_event_handler(Events.EPOCH_COMPLETED,
                                    train_util.update_lr, scheduler, "score")

        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {
                                        "model": model,
                                    })

        trainer.run(train_loader, max_epochs=config_parameters["epochs"])
        return outputdir
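
Note: eval_cv above feeds the key2refs / key2pred dicts straight into Cider.compute_score. Below is a minimal, standalone sketch of that interface, assuming the stock pycocoevalcap API (the zh= argument used in these examples is a project-specific extension and is omitted here); the keys and sentences are made up.

from pycocoevalcap.cider.cider import Cider

# Both dicts map an audio-clip key to a list of sentences; the predictions must
# contain exactly one candidate per key, mirroring key2pred[keys[idx]] = [candidate].
key2refs = {
    "clip_1": ["a dog barks while cars pass by", "a dog is barking near traffic"],
    "clip_2": ["rain falls on a metal roof", "heavy rain hits a rooftop"],
}
key2pred = {
    "clip_1": ["a dog barks in the street"],
    "clip_2": ["rain hits a roof"],
}

scorer = Cider()
score, scores = scorer.compute_score(key2refs, key2pred)
print(score)   # corpus-level CIDEr, stored as engine.state.metrics["score"] in eval_cv
print(scores)  # per-key CIDEr values
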
Code example #2
File: run_scst.py Project: wsntxxn/AudioCaption
    def train(self, config, **kwargs):
        """Trains a model on the given configurations.
        :param config: A training configuration (str). Note that all parameters in the config can also be manually adjusted with --ARG=VALUE
        :param **kwargs: parameters to overwrite yaml config
        """

        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.spider.spider import Spider

        conf = train_util.parse_config_or_kwargs(config, **kwargs)
        conf["seed"] = self.seed
        zh = conf["zh"]
        outputdir = os.path.join(
            conf["outputpath"], conf["modelwrapper"],
            # "{}_{}".format(
                # datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%m'),
                # uuid.uuid1().hex))
            conf["remark"], "seed_{}".format(self.seed)
        )

        # Early init because of creating dir
        checkpoint_handler = ModelCheckpoint(
            outputdir,
            "run",
            n_saved=1,
            require_empty=False,
            create_dir=True,
            score_function=lambda engine: engine.state.metrics["score"],
            score_name="score")

        logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
        # print passed config parameters
        logger.info("Storing files in: {}".format(outputdir))
        train_util.pprint_dict(conf, logger.info)

        vocabulary = torch.load(conf["vocab_file"])
        train_loader, val_loader, info = self._get_dataloaders(conf, vocabulary)
        conf["inputdim"] = info["inputdim"]
        logger.info("<== Estimating Scaler ({}) ==>".format(info["scaler"].__class__.__name__))
        logger.info(
                "Feature: {} Input dimension: {} Vocab Size: {}".format(
                conf["feature_file"], info["inputdim"], len(vocabulary)))
        train_key2refs = info["train_key2refs"]
        val_key2refs = info["val_key2refs"]

        model = self._get_model(conf, vocabulary)
        model = model.to(self.device)
        train_util.pprint_dict(model, logger.info, formatter="pretty")
        optimizer = getattr(
            torch.optim, conf["optimizer"]
        )(model.parameters(), **conf["optimizer_args"])
        train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

        crtrn_imprvd = train_util.criterion_improver(conf["improvecriterion"])

        scorer_dict = {"cider": Cider(zh=zh), "spider": Spider()}
        if "train_scorer" not in conf:
            conf["train_scorer"] = "cider"
        train_scorer = scorer_dict[conf["train_scorer"]]
        def _train_batch(engine, batch):
            # import pdb; pdb.set_trace()
            # set num batch tracked?
            model.train()
            with torch.enable_grad():
                optimizer.zero_grad()
                # train_scorer = scorer_dict[conf["train_scorer"]]
                output = self._forward(model, batch, "train", 
                                       key2refs=train_key2refs, 
                                       scorer=train_scorer,
                                       vocabulary=vocabulary)
                output["loss"].backward()
                optimizer.step()
                return output

        trainer = Engine(_train_batch)
        RunningAverage(output_transform=lambda x: x["loss"]).attach(trainer, "running_loss")
        pbar = ProgressBar(persist=False, ascii=True)
        pbar.attach(trainer, ["running_loss"])

        key2pred = {}

        def _inference(engine, batch):
            model.eval()
            keys = batch[2]
            with torch.no_grad():
                # val_scorer = Cider(zh=zh)
                # output = self._forward(model, batch, "train", 
                                       # key2refs=val_key2refs, scorer=val_scorer)
                # seqs = output["greedy_seqs"].cpu().numpy()
                output = self._forward(model, batch, "validation")
                seqs = output["seqs"].cpu().numpy()
                for idx, seq in enumerate(seqs):
                    if keys[idx] in key2pred:
                        continue
                    candidate = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                    key2pred[keys[idx]] = [candidate,]
                return output

        evaluator = Engine(_inference)

        metrics = {
            "loss": Average(output_transform=lambda x: x["loss"]),
            "reward": Average(output_transform=lambda x: x["reward"].reshape(-1, 1)),
            # "score": Average(output_transform=lambda x: x["score"].reshape(-1, 1)),
        }

        for name, metric in metrics.items():
            metric.attach(trainer, name)
            # metric.attach(evaluator, name)

        # RunningAverage(output_transform=lambda x: x["loss"]).attach(evaluator, "running_loss")
        # pbar.attach(evaluator, ["running_loss"])
        pbar.attach(evaluator) 

        trainer.add_event_handler(
              Events.EPOCH_COMPLETED, train_util.log_results, evaluator, val_loader,
              logger.info, metrics.keys(), ["score"])

        def eval_val(engine, key2pred, key2refs, scorer):
            score, scores = scorer.compute_score(key2refs, key2pred)
            engine.state.metrics["score"] = score
            key2pred.clear()

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, eval_val, key2pred, val_key2refs, Cider(zh=zh))

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, train_util.save_model_on_improved, crtrn_imprvd,
            "score", {
                "model": model.state_dict(),
                "config": conf,
                "scaler": info["scaler"]
            }, os.path.join(outputdir, "saved.pth"))

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler, {
                "model": model,
            }
        )

        trainer.run(train_loader, max_epochs=conf["epochs"])
        return outputdir
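
Note: in this example the self-critical (SCST) objective is computed inside self._forward(..., key2refs=..., scorer=...), which is not shown here. Below is a generic sketch of the standard SCST loss (REINFORCE with a greedy-decoding baseline) for orientation only; the tensor names are hypothetical and the project's implementation may differ.

import torch

def scst_loss(sampled_logprobs, sampled_reward, greedy_reward, mask):
    """sampled_logprobs: (B, T) log-probabilities of the sampled caption tokens
    sampled_reward / greedy_reward: (B,) e.g. CIDEr of sampled vs. greedy captions
    mask: (B, T) 1.0 for real tokens, 0.0 for padding
    """
    advantage = (sampled_reward - greedy_reward).unsqueeze(1)  # baseline-subtracted reward
    loss = -(advantage * sampled_logprobs) * mask              # REINFORCE policy gradient
    return loss.sum() / mask.sum()

# Smoke test with placeholder tensors
B, T = 4, 6
logprobs = torch.randn(B, T, requires_grad=True)  # stands in for per-token log-probs
loss = scst_loss(logprobs, torch.rand(B), torch.rand(B), torch.ones(B, T))
loss.backward()
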
Code example #3
    def train(self, config, **kwargs):
        """Trains a model on the given configurations.
        :param config: A training configuration (str). Note that all parameters in the config can also be manually adjusted with --ARG=VALUE
        :param **kwargs: parameters to overwrite yaml config
        """

        from pycocoevalcap.cider.cider import Cider

        config_parameters = train_util.parse_config_or_kwargs(config, **kwargs)
        config_parameters["seed"] = self.seed
        zh = config_parameters["zh"]
        outputdir = os.path.join(
            config_parameters["outputpath"], config_parameters["model"],
            "{}_{}".format(
                datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                uuid.uuid1().hex))

        # Early init because of creating dir
        checkpoint_handler = ModelCheckpoint(
            outputdir,
            "run",
            n_saved=1,
            require_empty=False,
            create_dir=True,
            score_function=lambda engine: -engine.state.metrics["loss"],
            score_name="loss")

        logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
        # print passed config parameters
        logger.info("Storing files in: {}".format(outputdir))
        train_util.pprint_dict(config_parameters, logger.info)

        vocabulary = torch.load(config_parameters["vocab_file"])
        trainloader, cvloader, info = self._get_dataloaders(config_parameters, vocabulary)
        config_parameters["inputdim"] = info["inputdim"]
        logger.info("<== Estimating Scaler ({}) ==>".format(info["scaler"].__class__.__name__))
        logger.info(
                "Stream: {} Input dimension: {} Vocab Size: {}".format(
                config_parameters["feature_stream"], info["inputdim"], len(vocabulary)))
        train_key2refs = info["train_key2refs"]
        # train_scorer = BatchCider(train_key2refs)
        cv_key2refs = info["cv_key2refs"]
        # cv_scorer = BatchCider(cv_key2refs)

        model = self._get_model(config_parameters, vocabulary)
        model = model.to(device)
        train_util.pprint_dict(model, logger.info, formatter="pretty")
        optimizer = getattr(
            torch.optim, config_parameters["optimizer"]
        )(model.parameters(), **config_parameters["optimizer_args"])
        train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            # optimizer, **config_parameters["scheduler_args"])
        crtrn_imprvd = train_util.criterion_improver(config_parameters["improvecriterion"])

        def _train_batch(engine, batch):
            model.train()
            with torch.enable_grad():
                optimizer.zero_grad()
                train_scorer = Cider(zh=zh)
                output = self._forward(model, batch, "train", train_mode="scst", 
                                       key2refs=train_key2refs, scorer=train_scorer)
                output["loss"].backward()
                optimizer.step()
                return output

        trainer = Engine(_train_batch)
        RunningAverage(output_transform=lambda x: x["loss"]).attach(trainer, "running_loss")
        pbar = ProgressBar(persist=False, ascii=True)
        pbar.attach(trainer, ["running_loss"])

        key2pred = {}

        def _inference(engine, batch):
            model.eval()
            keys = batch[2]
            with torch.no_grad():
                cv_scorer = Cider(zh=zh)
                output = self._forward(model, batch, "train", train_mode="scst",
                                       key2refs=cv_key2refs, scorer=cv_scorer)
                seqs = output["sampled_seqs"].cpu().numpy()
                for idx, seq in enumerate(seqs):
                    if keys[idx] in key2pred:
                        continue
                    candidate = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                    key2pred[keys[idx]] = [candidate,]
                return output

        evaluator = Engine(_inference)

        metrics = {
            "loss": Average(output_transform=lambda x: x["loss"]),
            "reward": Average(output_transform=lambda x: x["reward"].reshape(-1, 1)),
        }

        for name, metric in metrics.items():
            metric.attach(trainer, name)
            metric.attach(evaluator, name)

        RunningAverage(output_transform=lambda x: x["loss"]).attach(evaluator, "running_loss")
        pbar.attach(evaluator, ["running_loss"])

        # @trainer.on(Events.STARTED)
        # def log_initial_result(engine):
            # evaluator.run(cvloader, max_epochs=1)
            # logger.info("Initial Results - loss: {:<5.2f}\tscore: {:<5.2f}".format(evaluator.state.metrics["loss"], evaluator.state.metrics["score"].item()))


        trainer.add_event_handler(
              Events.EPOCH_COMPLETED, train_util.log_results, evaluator, cvloader,
              logger.info, metrics.keys(), ["loss", "reward", "score"])

        def eval_cv(engine, key2pred, key2refs, scorer):
            # if len(cv_key2refs) == 0:
                # for key, _ in key2pred.items():
                    # cv_key2refs[key] = key2refs[key]
            score, scores = scorer.compute_score(key2refs, key2pred)
            engine.state.metrics["score"] = score
            key2pred.clear()

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, eval_cv, key2pred, cv_key2refs, Cider(zh=zh))

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, train_util.save_model_on_improved, crtrn_imprvd,
            "score", {
                "model": model,
                "config": config_parameters,
                "scaler": info["scaler"]
            }, os.path.join(outputdir, "saved.pth"))

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler, {
                "model": model,
            }
        )

        trainer.run(trainloader, max_epochs=config_parameters["epochs"])
        return outputdir
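
Note: every example wires its pieces together through pytorch-ignite's Engine / Events API; extra positional arguments passed to add_event_handler are forwarded to the handler after the engine, which is why eval_cv(engine, key2pred, key2refs, scorer) above receives its dicts and scorer. A minimal toy sketch of that mechanism (names and data are made up):

from ignite.engine import Engine, Events

def process(engine, batch):
    # stands in for _train_batch / _inference above
    return {"loss": float(batch)}

toy_engine = Engine(process)

def on_epoch_end(engine, tag):
    # the extra "toy" argument passed below arrives here after the engine itself
    print(tag, "epoch", engine.state.epoch, "last output", engine.state.output)

toy_engine.add_event_handler(Events.EPOCH_COMPLETED, on_epoch_end, "toy")
toy_engine.run([1, 2, 3], max_epochs=2)
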
Code example #4
File: run.py Project: wsntxxn/AudioCaption
    def train(self, config, **kwargs):
        """Trains a model on the given configurations.
        :param config: A training configuration. Note that all parameters in the config can also be manually adjusted with --ARG=VALUE
        :param **kwargs: parameters to overwrite yaml config
        """
        from pycocoevalcap.cider.cider import Cider

        conf = train_util.parse_config_or_kwargs(config, **kwargs)
        conf["seed"] = self.seed
        
        assert "distributed" in conf

        if conf["distributed"]:
            torch.distributed.init_process_group(backend="nccl")
            self.local_rank = torch.distributed.get_rank()
            self.world_size = torch.distributed.get_world_size()
            assert kwargs["local_rank"] == self.local_rank
            torch.cuda.set_device(self.local_rank)
            self.device = torch.device("cuda", self.local_rank)
            # self.group = torch.distributed.new_group()

        if not conf["distributed"] or not self.local_rank:
            outputdir = str(
                Path(conf["outputpath"]) / 
                conf["model"] /
                # "{}_{}".format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%m"),
                               # uuid.uuid1().hex)
                conf["remark"] /
                "seed_{}".format(self.seed)
            )

            Path(outputdir).mkdir(parents=True, exist_ok=True)
            # # Early init because of creating dir
            # checkpoint_handler = ModelCheckpoint(
                # outputdir,
                # "run",
                # n_saved=1,
                # require_empty=False,
                # create_dir=False,
                # score_function=lambda engine: engine.state.metrics["score"],
                # score_name="score")

            logger = train_util.genlogger(str(Path(outputdir) / "train.log"))
            # print passed config parameters
            if "SLURM_JOB_ID" in os.environ:
                logger.info("Slurm job id: {}".format(os.environ["SLURM_JOB_ID"]))
            logger.info("Storing files in: {}".format(outputdir))
            train_util.pprint_dict(conf, logger.info)

        zh = conf["zh"]
        vocabulary = pickle.load(open(conf["vocab_file"], "rb"))
        dataloaders = self._get_dataloaders(conf, vocabulary)
        train_dataloader = dataloaders["train_dataloader"]
        val_dataloader = dataloaders["val_dataloader"]
        val_key2refs = dataloaders["val_key2refs"]
        data_dim = train_dataloader.dataset.data_dim
        conf["input_dim"] = data_dim
        if not conf["distributed"] or not self.local_rank:
            feature_data = conf["h5_csv"] if "h5_csv" in conf else conf["train_h5_csv"]
            logger.info(
                "Feature: {} Input dimension: {} Vocab Size: {}".format(
                    feature_data, data_dim, len(vocabulary)))

        model = self._get_model(conf, len(vocabulary))
        model = model.to(self.device)
        if conf["distributed"]:
            model = torch.nn.parallel.distributed.DistributedDataParallel(
                model, device_ids=[self.local_rank,], output_device=self.local_rank,
                find_unused_parameters=True)
        optimizer = getattr(
            torch.optim, conf["optimizer"]
        )(model.parameters(), **conf["optimizer_args"])

        if not conf["distributed"] or not self.local_rank:
            train_util.pprint_dict(model, logger.info, formatter="pretty")
            train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

        if conf["label_smoothing"]:
            criterion = train_util.LabelSmoothingLoss(len(vocabulary), smoothing=conf["smoothing"])
        else:
            criterion = torch.nn.CrossEntropyLoss().to(self.device)
        crtrn_imprvd = train_util.criterion_improver(conf['improvecriterion'])

        def _train_batch(engine, batch):
            if conf["distributed"]:
                train_dataloader.sampler.set_epoch(engine.state.epoch)
            model.train()
            with torch.enable_grad():
                optimizer.zero_grad()
                output = self._forward(
                    model, batch, "train",
                    ss_ratio=conf["ss_args"]["ss_ratio"]
                )
                loss = criterion(output["packed_logits"], output["targets"]).to(self.device)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), conf["max_grad_norm"])
                optimizer.step()
                output["loss"] = loss.item()
                return output

        trainer = Engine(_train_batch)
        RunningAverage(output_transform=lambda x: x["loss"]).attach(trainer, "running_loss")
        pbar = ProgressBar(persist=False, ascii=True, ncols=100)
        pbar.attach(trainer, ["running_loss"])

        key2pred = {}

        def _inference(engine, batch):
            model.eval()
            keys = batch[0]
            with torch.no_grad():
                output = self._forward(model, batch, "validation")
                seqs = output["seqs"].cpu().numpy()
                for (idx, seq) in enumerate(seqs):
                    candidate = self._convert_idx2sentence(seq, vocabulary, zh)
                    key2pred[keys[idx]] = [candidate,]
                return output

        metrics = {
            "loss": Loss(criterion, output_transform=lambda x: (x["packed_logits"], x["targets"])),
            "accuracy": Accuracy(output_transform=lambda x: (x["packed_logits"], x["targets"])),
        }
        for name, metric in metrics.items():
            metric.attach(trainer, name)

        evaluator = Engine(_inference)

        def eval_val(engine, key2pred, key2refs):
            scorer = Cider(zh=zh)
            score_output = self._eval_prediction(key2refs, key2pred, [scorer])
            engine.state.metrics["score"] = score_output["CIDEr"]
            key2pred.clear()

        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, eval_val, key2pred, val_key2refs)

        pbar.attach(evaluator)

        # Learning rate scheduler
        if "scheduler" in conf:
            try:
                scheduler = getattr(torch.optim.lr_scheduler, conf["scheduler"])(
                    optimizer, **conf["scheduler_args"])
            except AttributeError:
                import utils.lr_scheduler
                if conf["scheduler"] == "ExponentialDecayScheduler":
                    conf["scheduler_args"]["total_iters"] = len(train_dataloader) * conf["epochs"]
                scheduler = getattr(utils.lr_scheduler, conf["scheduler"])(
                    optimizer, **conf["scheduler_args"])
            if scheduler.__class__.__name__ in ["StepLR", "ReduceLROnPlateau", "ExponentialLR", "MultiStepLR"]:
                evaluator.add_event_handler(
                    Events.EPOCH_COMPLETED, train_util.update_lr,
                    scheduler, "score")
            else:
                trainer.add_event_handler(
                    Events.ITERATION_COMPLETED, train_util.update_lr, scheduler, None)
        
        # Scheduled sampling
        if conf["ss"]:
            trainer.add_event_handler(
                Events.GET_BATCH_COMPLETED, train_util.update_ss_ratio, conf, len(train_dataloader))

        #########################
        # Events for main process: mostly logging and saving
        #########################
        if not conf["distributed"] or not self.local_rank:
            # logging training and validation loss and metrics
            trainer.add_event_handler(
                Events.EPOCH_COMPLETED, train_util.log_results, optimizer, evaluator, val_dataloader,
                logger.info, metrics.keys(), ["score"])
            # saving best model
            evaluator.add_event_handler(
                Events.EPOCH_COMPLETED, train_util.save_model_on_improved, crtrn_imprvd,
                "score", {
                    "model": model.state_dict() if not conf["distributed"] else model.module.state_dict(),
                    # "config": conf,
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": scheduler.state_dict()
                }, str(Path(outputdir) / "saved.pth")
            )
            # regular checkpoint
            checkpoint_handler = ModelCheckpoint(
                outputdir,
                "run",
                n_saved=1,
                require_empty=False,
                create_dir=False,
                score_function=lambda engine: engine.state.metrics["score"],
                score_name="score")
            evaluator.add_event_handler(
                Events.EPOCH_COMPLETED, checkpoint_handler, {
                    "model": model,
                }
            )
            # dump configuration
            train_util.store_yaml(conf, str(Path(outputdir) / "config.yaml"))

        #########################
        # Start training
        #########################
        trainer.run(train_dataloader, max_epochs=conf["epochs"])
        if not conf["distributed"] or not self.local_rank:
            return outputdir
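
Note: when conf["label_smoothing"] is set, this example swaps in a project-specific train_util.LabelSmoothingLoss that shares the criterion(logits, targets) call signature of CrossEntropyLoss. A generic label-smoothing sketch with that interface is given below; the actual project class may differ (e.g. in how padding tokens are handled).

import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    """Generic label-smoothing cross-entropy over packed (N, vocab_size) logits."""
    def __init__(self, vocab_size, smoothing=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.smoothing = smoothing

    def forward(self, logits, targets):
        log_probs = torch.log_softmax(logits, dim=-1)
        nll = -log_probs.gather(dim=-1, index=targets.unsqueeze(1)).squeeze(1)
        smooth = -log_probs.mean(dim=-1)  # cross-entropy against a uniform distribution
        return ((1.0 - self.smoothing) * nll + self.smoothing * smooth).mean()

criterion = LabelSmoothingLoss(vocab_size=100, smoothing=0.1)
loss = criterion(torch.randn(32, 100), torch.randint(0, 100, (32,)))
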