def train(self, config, **kwargs):
    """Trains a model on the given configuration.

    :param config: A training configuration (YAML path). Note that all parameters
        in the config can also be manually adjusted with --ARG=VALUE
    :param **kwargs: parameters to overwrite yaml config
    """
    from pycocoevalcap.cider.cider import Cider

    config_parameters = train_util.parse_config_or_kwargs(config, **kwargs)
    config_parameters["seed"] = self.seed
    outputdir = os.path.join(
        config_parameters["outputpath"], config_parameters["model"],
        "{}_{}".format(
            # timestamp: year-month-day_hour-minute-second
            datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
            uuid.uuid1().hex))
    # Early init because of creating dir
    checkpoint_handler = ModelCheckpoint(
        outputdir,
        "run",
        n_saved=1,
        require_empty=False,
        create_dir=True,
        score_function=lambda engine: engine.state.metrics["score"],
        score_name="score")
    logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
    # print passed config parameters
    logger.info("Storing files in: {}".format(outputdir))
    train_util.pprint_dict(config_parameters, logger.info)

    zh = config_parameters["zh"]
    vocabulary = torch.load(config_parameters["vocab_file"])
    train_loader, cv_loader, info = self._get_dataloaders(
        config_parameters, vocabulary)
    config_parameters["inputdim"] = info["inputdim"]
    cv_key2refs = info["cv_key2refs"]
    logger.info("<== Estimating Scaler ({}) ==>".format(
        info["scaler"].__class__.__name__))
    logger.info("Feature: {} Input dimension: {} Vocab Size: {}".format(
        config_parameters["feature_file"], info["inputdim"],
        len(vocabulary)))

    model = self._get_model(config_parameters, len(vocabulary))
    if "pretrained_word_embedding" in config_parameters:
        embeddings = np.load(config_parameters["pretrained_word_embedding"])
        model.load_word_embeddings(
            embeddings,
            tune=config_parameters["tune_word_embedding"],
            projection=True)
    model = model.to(self.device)
    train_util.pprint_dict(model, logger.info, formatter="pretty")

    optimizer = getattr(torch.optim, config_parameters["optimizer"])(
        model.parameters(), **config_parameters["optimizer_args"])
    train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

    criterion = torch.nn.CrossEntropyLoss().to(self.device)
    crtrn_imprvd = train_util.criterion_improver(
        config_parameters["improvecriterion"])

    def _train_batch(engine, batch):
        # one cross-entropy training step
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            output = self._forward(model, batch, "train")
            loss = criterion(output["packed_logits"],
                             output["targets"]).to(self.device)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            output["loss"] = loss.item()
        return output

    trainer = Engine(_train_batch)
    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        trainer, "running_loss")
    pbar = ProgressBar(persist=False, ascii=True, ncols=100)
    pbar.attach(trainer, ["running_loss"])

    key2pred = {}

    def _inference(engine, batch):
        # decode captions on the validation set and collect them per key
        model.eval()
        keys = batch[2]
        with torch.no_grad():
            output = self._forward(model, batch, "validation")
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                if keys[idx] in key2pred:
                    continue
                candidate = self._convert_idx2sentence(seq, vocabulary, zh)
                key2pred[keys[idx]] = [candidate, ]
            return output

    metrics = {
        "loss": Loss(criterion,
                     output_transform=lambda x: (x["packed_logits"],
                                                 x["targets"]))
    }

    evaluator = Engine(_inference)

    def eval_cv(engine, key2pred, key2refs):
        # compute the corpus-level CIDEr on the collected predictions
        scorer = Cider(zh=zh)
        score, scores = scorer.compute_score(key2refs, key2pred)
        engine.state.metrics["score"] = score
        key2pred.clear()

    evaluator.add_event_handler(Events.EPOCH_COMPLETED, eval_cv, key2pred,
                                cv_key2refs)

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, train_util.log_results,
                              evaluator, cv_loader, logger.info,
                              ["loss", "score"])

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, train_util.save_model_on_improved,
        crtrn_imprvd, "score", {
            "model": model.state_dict(),
            "config": config_parameters,
            "scaler": info["scaler"]
        }, os.path.join(outputdir, "saved.pth"))

    scheduler = getattr(torch.optim.lr_scheduler,
                        config_parameters["scheduler"])(
        optimizer, **config_parameters["scheduler_args"])
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, train_util.update_lr,
                                scheduler, "score")

    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
        "model": model,
    })

    trainer.run(train_loader, max_epochs=config_parameters["epochs"])
    return outputdir
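# --- Illustrative sketch, not part of the original code ---
# The trainer above selects checkpoints by the CIDEr score that eval_cv computes
# from key2refs / key2pred. Both are plain dicts mapping an audio key to a list
# of caption strings, which is the format pycocoevalcap's Cider.compute_score
# expects. The keys and captions below are made up, and the `zh` keyword used
# above is assumed to be a project-specific extension of the scorer, so it is
# omitted here.
def _example_cider_scoring():
    from pycocoevalcap.cider.cider import Cider

    # references: each key maps to one or more ground-truth captions
    key2refs = {
        "audio_1": ["a dog barks loudly", "a dog is barking"],
        "audio_2": ["rain falls on a roof"],
    }
    # predictions: each key maps to a single-element list with the hypothesis
    key2pred = {
        "audio_1": ["a dog barks"],
        "audio_2": ["rain is falling"],
    }
    scorer = Cider()
    # corpus-level score plus one score per key, mirroring eval_cv above
    score, scores = scorer.compute_score(key2refs, key2pred)
    print("corpus CIDEr: {:.3f}".format(score))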
def train(self, config, **kwargs):
    """Trains a model on the given configuration.

    :param config: A training configuration (YAML path). Note that all parameters
        in the config can also be manually adjusted with --ARG=VALUE
    :param **kwargs: parameters to overwrite yaml config
    """
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.spider.spider import Spider

    conf = train_util.parse_config_or_kwargs(config, **kwargs)
    conf["seed"] = self.seed
    zh = conf["zh"]
    outputdir = os.path.join(
        conf["outputpath"], conf["modelwrapper"],
        conf["remark"], "seed_{}".format(self.seed))
    # Early init because of creating dir
    checkpoint_handler = ModelCheckpoint(
        outputdir,
        "run",
        n_saved=1,
        require_empty=False,
        create_dir=True,
        score_function=lambda engine: engine.state.metrics["score"],
        score_name="score")
    logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
    # print passed config parameters
    logger.info("Storing files in: {}".format(outputdir))
    train_util.pprint_dict(conf, logger.info)

    vocabulary = torch.load(conf["vocab_file"])
    train_loader, val_loader, info = self._get_dataloaders(conf, vocabulary)
    conf["inputdim"] = info["inputdim"]
    logger.info("<== Estimating Scaler ({}) ==>".format(
        info["scaler"].__class__.__name__))
    logger.info("Feature: {} Input dimension: {} Vocab Size: {}".format(
        conf["feature_file"], info["inputdim"], len(vocabulary)))
    train_key2refs = info["train_key2refs"]
    val_key2refs = info["val_key2refs"]

    model = self._get_model(conf, vocabulary)
    model = model.to(self.device)
    train_util.pprint_dict(model, logger.info, formatter="pretty")

    optimizer = getattr(
        torch.optim, conf["optimizer"]
    )(model.parameters(), **conf["optimizer_args"])
    train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

    crtrn_imprvd = train_util.criterion_improver(conf["improvecriterion"])

    # scorer used to compute the training reward
    scorer_dict = {"cider": Cider(zh=zh), "spider": Spider()}
    if "train_scorer" not in conf:
        conf["train_scorer"] = "cider"
    train_scorer = scorer_dict[conf["train_scorer"]]

    def _train_batch(engine, batch):
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            output = self._forward(model, batch, "train",
                                   key2refs=train_key2refs,
                                   scorer=train_scorer,
                                   vocabulary=vocabulary)
            output["loss"].backward()
            optimizer.step()
            return output

    trainer = Engine(_train_batch)
    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        trainer, "running_loss")
    pbar = ProgressBar(persist=False, ascii=True)
    pbar.attach(trainer, ["running_loss"])

    key2pred = {}

    def _inference(engine, batch):
        # decode captions on the validation set and collect them per key
        model.eval()
        keys = batch[2]
        with torch.no_grad():
            output = self._forward(model, batch, "validation")
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                if keys[idx] in key2pred:
                    continue
                candidate = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                key2pred[keys[idx]] = [candidate, ]
            return output

    evaluator = Engine(_inference)

    metrics = {
        "loss": Average(output_transform=lambda x: x["loss"]),
        "reward": Average(output_transform=lambda x: x["reward"].reshape(-1, 1)),
    }
    for name, metric in metrics.items():
        metric.attach(trainer, name)

    pbar.attach(evaluator)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, train_util.log_results, evaluator, val_loader,
        logger.info, metrics.keys(), ["score"])

    def eval_val(engine, key2pred, key2refs, scorer):
        # compute the corpus-level validation score on the collected predictions
        score, scores = scorer.compute_score(key2refs, key2pred)
        engine.state.metrics["score"] = score
        key2pred.clear()

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, eval_val, key2pred, val_key2refs, Cider(zh=zh))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, train_util.save_model_on_improved,
        crtrn_imprvd, "score", {
            "model": model.state_dict(),
            "config": conf,
            "scaler": info["scaler"]
        }, os.path.join(outputdir, "saved.pth"))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler, {
            "model": model,
        }
    )

    trainer.run(train_loader, max_epochs=conf["epochs"])
    return outputdir
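# --- Illustrative sketch, not part of the original code ---
# train_util.parse_config_or_kwargs is used above but not defined in this
# section. A minimal sketch of the yaml-plus-kwargs merge it is assumed to
# perform: the yaml file provides defaults and any --ARG=VALUE pairs passed on
# the command line arrive in **kwargs and override them. The function name and
# behaviour here are illustrative, not the project's actual implementation.
def _example_parse_config_or_kwargs(config_file, **kwargs):
    import yaml

    with open(config_file) as f:
        conf = yaml.safe_load(f)
    # command-line overrides win over the yaml defaults
    conf.update(kwargs)
    return conf

# e.g. a call like train("config.yaml", batch_size=16) would yield the yaml
# contents with batch_size replaced by 16.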
def train(self, config, **kwargs):
    """Trains a model on the given configuration.

    :param config: A training configuration (YAML path). Note that all parameters
        in the config can also be manually adjusted with --ARG=VALUE
    :param **kwargs: parameters to overwrite yaml config
    """
    from pycocoevalcap.cider.cider import Cider

    config_parameters = train_util.parse_config_or_kwargs(config, **kwargs)
    config_parameters["seed"] = self.seed
    zh = config_parameters["zh"]
    outputdir = os.path.join(
        config_parameters["outputpath"], config_parameters["model"],
        "{}_{}".format(
            # timestamp: year-month-day_hour-minute-second
            datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
            uuid.uuid1().hex))
    # Early init because of creating dir
    checkpoint_handler = ModelCheckpoint(
        outputdir,
        "run",
        n_saved=1,
        require_empty=False,
        create_dir=True,
        score_function=lambda engine: -engine.state.metrics["loss"],
        score_name="loss")
    logger = train_util.genlogger(os.path.join(outputdir, "train.log"))
    # print passed config parameters
    logger.info("Storing files in: {}".format(outputdir))
    train_util.pprint_dict(config_parameters, logger.info)

    vocabulary = torch.load(config_parameters["vocab_file"])
    trainloader, cvloader, info = self._get_dataloaders(config_parameters,
                                                        vocabulary)
    config_parameters["inputdim"] = info["inputdim"]
    logger.info("<== Estimating Scaler ({}) ==>".format(
        info["scaler"].__class__.__name__))
    logger.info("Stream: {} Input dimension: {} Vocab Size: {}".format(
        config_parameters["feature_stream"], info["inputdim"],
        len(vocabulary)))
    train_key2refs = info["train_key2refs"]
    cv_key2refs = info["cv_key2refs"]

    model = self._get_model(config_parameters, vocabulary)
    model = model.to(self.device)
    train_util.pprint_dict(model, logger.info, formatter="pretty")

    optimizer = getattr(
        torch.optim, config_parameters["optimizer"]
    )(model.parameters(), **config_parameters["optimizer_args"])
    train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

    crtrn_imprvd = train_util.criterion_improver(
        config_parameters["improvecriterion"])

    def _train_batch(engine, batch):
        # one self-critical (SCST) training step; the scorer is recreated
        # per batch and used to compute the reward inside self._forward
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            train_scorer = Cider(zh=zh)
            output = self._forward(model, batch, "train", train_mode="scst",
                                   key2refs=train_key2refs,
                                   scorer=train_scorer)
            output["loss"].backward()
            optimizer.step()
            return output

    trainer = Engine(_train_batch)
    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        trainer, "running_loss")
    pbar = ProgressBar(persist=False, ascii=True)
    pbar.attach(trainer, ["running_loss"])

    key2pred = {}

    def _inference(engine, batch):
        # run the SCST forward pass on the validation set and collect the
        # sampled captions per key
        model.eval()
        keys = batch[2]
        with torch.no_grad():
            cv_scorer = Cider(zh=zh)
            output = self._forward(model, batch, "train", train_mode="scst",
                                   key2refs=cv_key2refs, scorer=cv_scorer)
            seqs = output["sampled_seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                if keys[idx] in key2pred:
                    continue
                candidate = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                key2pred[keys[idx]] = [candidate, ]
            return output

    evaluator = Engine(_inference)

    metrics = {
        "loss": Average(output_transform=lambda x: x["loss"]),
        "reward": Average(output_transform=lambda x: x["reward"].reshape(-1, 1)),
    }
    for name, metric in metrics.items():
        metric.attach(trainer, name)
        metric.attach(evaluator, name)

    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        evaluator, "running_loss")
    pbar.attach(evaluator, ["running_loss"])

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, train_util.log_results, evaluator, cvloader,
        logger.info, metrics.keys(), ["loss", "reward", "score"])

    def eval_cv(engine, key2pred, key2refs, scorer):
        # compute the corpus-level CIDEr on the collected predictions
        score, scores = scorer.compute_score(key2refs, key2pred)
        engine.state.metrics["score"] = score
        key2pred.clear()

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, eval_cv, key2pred, cv_key2refs, Cider(zh=zh))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, train_util.save_model_on_improved,
        crtrn_imprvd, "score", {
            "model": model,
            "config": config_parameters,
            "scaler": info["scaler"]
        }, os.path.join(outputdir, "saved.pth"))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler, {
            "model": model,
        }
    )

    trainer.run(trainloader, max_epochs=config_parameters["epochs"])
    return outputdir
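# --- Illustrative sketch, not part of the original code ---
# The "scst" branch of self._forward is not shown in this section. In
# self-critical sequence training the reward is typically the CIDEr of a
# sampled caption minus the CIDEr of the greedy caption, and the loss is the
# negative reward-weighted log-likelihood of the sampled tokens. A minimal
# sketch under those assumptions; the function name and tensor layout are
# illustrative only.
import torch

def _example_scst_loss(sampled_logprobs, sampled_score, greedy_score, mask):
    """
    sampled_logprobs: (batch, seq_len) log-probabilities of the sampled tokens
    sampled_score, greedy_score: (batch,) per-caption CIDEr scores
    mask: (batch, seq_len), 1.0 for real tokens, 0.0 for padding
    """
    # baseline-subtracted reward, broadcast over the time dimension
    reward = (sampled_score - greedy_score).unsqueeze(1)  # (batch, 1)
    # REINFORCE with the greedy decode as baseline
    loss = -(reward * sampled_logprobs * mask).sum() / mask.sum()
    return loss, reward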
def train(self, config, **kwargs):
    """Trains a model on the given configuration.

    :param config: A training configuration (YAML path). Note that all parameters
        in the config can also be manually adjusted with --ARG=VALUE
    :param **kwargs: parameters to overwrite yaml config
    """
    from pycocoevalcap.cider.cider import Cider

    conf = train_util.parse_config_or_kwargs(config, **kwargs)
    conf["seed"] = self.seed

    assert "distributed" in conf
    if conf["distributed"]:
        torch.distributed.init_process_group(backend="nccl")
        self.local_rank = torch.distributed.get_rank()
        self.world_size = torch.distributed.get_world_size()
        assert kwargs["local_rank"] == self.local_rank
        torch.cuda.set_device(self.local_rank)
        self.device = torch.device("cuda", self.local_rank)

    # only the main process creates the output directory and logs
    if not conf["distributed"] or not self.local_rank:
        outputdir = str(
            Path(conf["outputpath"]) / conf["model"] /
            conf["remark"] / "seed_{}".format(self.seed)
        )
        Path(outputdir).mkdir(parents=True, exist_ok=True)
        logger = train_util.genlogger(str(Path(outputdir) / "train.log"))
        # print passed config parameters
        if "SLURM_JOB_ID" in os.environ:
            logger.info("Slurm job id: {}".format(os.environ["SLURM_JOB_ID"]))
        logger.info("Storing files in: {}".format(outputdir))
        train_util.pprint_dict(conf, logger.info)

    zh = conf["zh"]
    vocabulary = pickle.load(open(conf["vocab_file"], "rb"))
    dataloaders = self._get_dataloaders(conf, vocabulary)
    train_dataloader = dataloaders["train_dataloader"]
    val_dataloader = dataloaders["val_dataloader"]
    val_key2refs = dataloaders["val_key2refs"]
    data_dim = train_dataloader.dataset.data_dim
    conf["input_dim"] = data_dim

    if not conf["distributed"] or not self.local_rank:
        feature_data = conf["h5_csv"] if "h5_csv" in conf else conf["train_h5_csv"]
        logger.info("Feature: {} Input dimension: {} Vocab Size: {}".format(
            feature_data, data_dim, len(vocabulary)))

    model = self._get_model(conf, len(vocabulary))
    model = model.to(self.device)
    if conf["distributed"]:
        model = torch.nn.parallel.distributed.DistributedDataParallel(
            model, device_ids=[self.local_rank, ],
            output_device=self.local_rank,
            find_unused_parameters=True)

    optimizer = getattr(
        torch.optim, conf["optimizer"]
    )(model.parameters(), **conf["optimizer_args"])

    if not conf["distributed"] or not self.local_rank:
        train_util.pprint_dict(model, logger.info, formatter="pretty")
        train_util.pprint_dict(optimizer, logger.info, formatter="pretty")

    if conf["label_smoothing"]:
        criterion = train_util.LabelSmoothingLoss(
            len(vocabulary), smoothing=conf["smoothing"])
    else:
        criterion = torch.nn.CrossEntropyLoss().to(self.device)
    crtrn_imprvd = train_util.criterion_improver(conf["improvecriterion"])

    def _train_batch(engine, batch):
        # one cross-entropy training step with scheduled sampling
        if conf["distributed"]:
            train_dataloader.sampler.set_epoch(engine.state.epoch)
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            output = self._forward(
                model, batch, "train",
                ss_ratio=conf["ss_args"]["ss_ratio"]
            )
            loss = criterion(output["packed_logits"],
                             output["targets"]).to(self.device)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           conf["max_grad_norm"])
            optimizer.step()
            output["loss"] = loss.item()
        return output

    trainer = Engine(_train_batch)
    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        trainer, "running_loss")
    pbar = ProgressBar(persist=False, ascii=True, ncols=100)
    pbar.attach(trainer, ["running_loss"])

    key2pred = {}

    def _inference(engine, batch):
        # decode captions on the validation set and collect them per key
        model.eval()
        keys = batch[0]
        with torch.no_grad():
            output = self._forward(model, batch, "validation")
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                candidate = self._convert_idx2sentence(seq, vocabulary, zh)
                key2pred[keys[idx]] = [candidate, ]
            return output

    metrics = {
        "loss": Loss(criterion,
                     output_transform=lambda x: (x["packed_logits"],
                                                 x["targets"])),
        "accuracy": Accuracy(
            output_transform=lambda x: (x["packed_logits"], x["targets"])),
    }
    for name, metric in metrics.items():
        metric.attach(trainer, name)

    evaluator = Engine(_inference)

    def eval_val(engine, key2pred, key2refs):
        # compute the corpus-level CIDEr on the collected predictions
        scorer = Cider(zh=zh)
        score_output = self._eval_prediction(key2refs, key2pred, [scorer])
        engine.state.metrics["score"] = score_output["CIDEr"]
        key2pred.clear()

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, eval_val, key2pred, val_key2refs)

    pbar.attach(evaluator)

    # Learning rate scheduler
    if "scheduler" in conf:
        try:
            scheduler = getattr(torch.optim.lr_scheduler, conf["scheduler"])(
                optimizer, **conf["scheduler_args"])
        except AttributeError:
            import utils.lr_scheduler
            if conf["scheduler"] == "ExponentialDecayScheduler":
                conf["scheduler_args"]["total_iters"] = len(
                    train_dataloader) * conf["epochs"]
            scheduler = getattr(utils.lr_scheduler, conf["scheduler"])(
                optimizer, **conf["scheduler_args"])
        if scheduler.__class__.__name__ in [
                "StepLR", "ReduceLROnPlateau", "ExponentialLR", "MultiStepLR"]:
            # epoch-level schedulers are stepped with the validation score
            evaluator.add_event_handler(
                Events.EPOCH_COMPLETED, train_util.update_lr, scheduler,
                "score")
        else:
            # custom schedulers are stepped every iteration
            trainer.add_event_handler(
                Events.ITERATION_COMPLETED, train_util.update_lr, scheduler,
                None)

    # Scheduled sampling
    if conf["ss"]:
        trainer.add_event_handler(
            Events.GET_BATCH_COMPLETED, train_util.update_ss_ratio, conf,
            len(train_dataloader))

    #########################
    # Events for main process: mostly logging and saving
    #########################
    if not conf["distributed"] or not self.local_rank:
        # logging training and validation loss and metrics
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, train_util.log_results, optimizer,
            evaluator, val_dataloader, logger.info, metrics.keys(), ["score"])

        # saving best model
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, train_util.save_model_on_improved,
            crtrn_imprvd, "score", {
                "model": model.state_dict() if not conf["distributed"]
                         else model.module.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": scheduler.state_dict()
            },
            str(Path(outputdir) / "saved.pth")
        )

        # regular checkpoint
        checkpoint_handler = ModelCheckpoint(
            outputdir,
            "run",
            n_saved=1,
            require_empty=False,
            create_dir=False,
            score_function=lambda engine: engine.state.metrics["score"],
            score_name="score")
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler, {
                "model": model,
            }
        )

        # dump configuration
        train_util.store_yaml(conf, str(Path(outputdir) / "config.yaml"))

    #########################
    # Start training
    #########################
    trainer.run(train_dataloader, max_epochs=conf["epochs"])

    if not conf["distributed"] or not self.local_rank:
        return outputdir