def run(config):
    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]

    # Loaders
    batch_size = opt_cfg["batch_size"]
    preproc = loader.Preprocessor(data_cfg["train_set"],
                                  start_and_end=data_cfg["start_and_end"])
    train_ldr = loader.make_loader(data_cfg["train_set"], preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"], preproc, batch_size)

    # Model
    model_class = eval("models." + model_cfg["class"])
    model = model_class(preproc.input_dim, preproc.vocab_size, model_cfg)
    use_cuda = torch.cuda.is_available()  # fall back to the CPU when no GPU is available
    #model.cuda() if use_cuda else model.cpu()
    if use_cuda:
        model.cuda()
    else:
        model.cpu()

    # Optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt_cfg["learning_rate"],
                                momentum=opt_cfg["momentum"])

    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()
        run_state = run_epoch(model, optimizer, train_ldr, *run_state)
        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))

        dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

        # Log for tensorboard
        tb.log_value("dev_loss", dev_loss, e)
        tb.log_value("dev_cer", dev_cer, e)

        speech.save(model, preproc, config["save_path"])

        # Save the best model on the dev set
        if dev_cer < best_so_far:
            best_so_far = dev_cer
            speech.save(model, preproc, config["save_path"], tag="best")
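# Hedged usage sketch (not part of the original source): a minimal config dict with the
# keys that run(config) above looks up. The paths, the model class name ("Seq2Seq"), and
# the hyperparameter values are illustrative assumptions, not values taken from this repo.
example_config = {
    "optimizer": {"batch_size": 16, "learning_rate": 1e-3,
                  "momentum": 0.9, "epochs": 100},
    "data": {"train_set": "data/train.json", "dev_set": "data/dev.json",
             "start_and_end": True},
    "model": {"class": "Seq2Seq"},  # resolved via eval("models." + model_cfg["class"])
    "save_path": "checkpoints/run1"
}
# run(example_config)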
def run(config, use_cuda):
    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]
    aud_cfg = config['audio']
    batch_size = opt_cfg["batch_size"]

    load_pre = True
    if load_pre:
        # TODO: add code for checking whether a pretrained model actually exists.
        # If not, initialize the model and the rest from scratch.
        model, _, preproc = speech.load("ctc_best", tag="best")
    else:
        preproc = loader.Preprocessor(data_cfg["train_set"], aud_cfg,
                                      start_and_end=data_cfg["start_and_end"])

    # eval('print("Hello")') will actually call print("Hello")
    model_class = eval("models." + model_cfg["class"])

    # define model
    model = model_class(preproc.input_dim, preproc.vocab_size, model_cfg)
    model = model.cuda() if use_cuda else model.cpu()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt_cfg["learning_rate"],
                                momentum=opt_cfg["momentum"])

    # Dataloader is a subclass of pytorch.utils.dataloader. Can iterate
    train_ldr = loader.make_loader(data_cfg["train_set"], preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"], preproc, batch_size)

    print("Epochs to train:", opt_cfg["epochs"])
    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()
        run_state = run_epoch(model, optimizer, train_ldr, *run_state)
        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))

        if (e % 10 == 0) or (e == (opt_cfg["epochs"] - 1)):
            dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

            # Log for tensorboard
            tb.log_value("dev_loss", dev_loss, e)
            tb.log_value("dev_cer", dev_cer, e)

            speech.save(model, optimizer, preproc, config["save_path"])

            # Save the best model on the dev set
            if dev_cer < best_so_far:
                best_so_far = dev_cer
                speech.save(model, optimizer, preproc, config["save_path"], tag="best")
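# Hedged entry-point sketch (illustrative, not from the repo): load the training config
# from a yaml file and dispatch to run(config, use_cuda). The file name "ctc_config.yaml"
# is an assumption; the config must contain the "optimizer", "data", "model", and "audio"
# sections read above.
if __name__ == "__main__":
    import yaml
    import torch

    with open("ctc_config.yaml", "r") as fid:
        config = yaml.safe_load(fid)
    run(config, use_cuda=torch.cuda.is_available())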
def run(model_path, dataset_json, batch_size=8, tag="best", out_file=None):
    use_cuda = torch.cuda.is_available()

    model, preproc = speech.load(model_path, tag=tag)
    ldr = loader.make_loader(dataset_json, preproc, batch_size)

    model.cuda() if use_cuda else model.cpu()
    model.set_eval()

    results = eval_loop(model, ldr)
    results = [(preproc.decode(label), preproc.decode(pred))
               for label, pred in results]
    cer = speech.compute_cer(results)
    print("CER {:.3f}".format(cer))

    if out_file is not None:
        with open(out_file, 'w') as fid:
            for label, pred in results:
                res = {'prediction': pred,
                       'label': label}
                json.dump(res, fid)
                fid.write("\n")
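# Hedged CLI sketch (illustrative, not from the repo): one way to expose the evaluation
# run() above as a script. The flag names and defaults are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate a saved model on a dataset.")
    parser.add_argument("model_path", help="directory containing the saved model")
    parser.add_argument("dataset_json", help="path to the dataset json file")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--tag", default="best")
    parser.add_argument("--out_file", default=None,
                        help="optional path to write per-example predictions as json lines")
    args = parser.parse_args()

    run(args.model_path, args.dataset_json,
        batch_size=args.batch_size, tag=args.tag, out_file=args.out_file)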
def test_loader():
    batch_size = 2
    data_json = "test.json"
    preproc = loader.Preprocessor(data_json)
    ldr = loader.make_loader(data_json, preproc, batch_size, num_workers=0)

    # Test that batches are properly sorted by size
    for inputs, labels in ldr:
        assert inputs[0].shape == inputs[1].shape
def main(model_path: str, json_path: str, use_cuda: bool, log_name: str, use_augmentation: bool):
    """
    Runs the eval_dev loop in train continually while saving relevant data to a log file.
    """
    # create logger
    logger = logging.getLogger("eval-dev_log")
    logger.setLevel(logging.DEBUG)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(log_name + ".log")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  "%Y-%m-%d %H:%M:%S")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # loading model and preproc
    model, preproc = speech.load(model_path, tag="best")
    model.cuda() if use_cuda else model.cpu()
    print(f"spec_aug status: {preproc.spec_augment}")

    # creating loader
    dev_ldr = loader.make_loader(json_path, preproc, batch_size=1)

    iterations = 500

    logger.info("============= Trial info ============")
    logger.info(f"model path: {model_path}")
    logger.info(f"json path: {json_path}")
    logger.info(f"use_augmentation: {use_augmentation}")
    logger.info(f"preproc: {preproc}")
    logger.info(f"model: {model}")

    for i in range(iterations):
        logger.info("\n=================================================\n")
        logger.info(f"Iteration: {i}")

        loss, cer = eval_dev(model, dev_ldr, preproc, logger, use_augmentation)
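# Hedged CLI sketch (illustrative, not from the repo): wiring for the main() signature
# above. The flag names are assumptions; use_cuda is taken from whatever the machine supports.
if __name__ == "__main__":
    import argparse
    import torch

    parser = argparse.ArgumentParser(description="Repeatedly run eval_dev and log the results.")
    parser.add_argument("model_path", help="directory containing the saved model")
    parser.add_argument("json_path", help="path to the dev-set json file")
    parser.add_argument("log_name", help="name of the output log file (without the .log suffix)")
    parser.add_argument("--use_augmentation", action="store_true")
    args = parser.parse_args()

    main(args.model_path, args.json_path,
         use_cuda=torch.cuda.is_available(),
         log_name=args.log_name,
         use_augmentation=args.use_augmentation)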
def run_eval(
        model_path,
        dataset_json,
        batch_size=8,
        tag="best",
        model_name="model_state_dict.pth",
        device=None,
        add_filename=False,
        add_maxdecode: bool = False,
        formatted=False,
        config_path=None,
        out_file=None) -> float:
    """
    Calculates the distance between the predictions from the model in model_path
    and the labels in dataset_json.

    Args:
        model_path (str): path to the directory that contains the model
        dataset_json (str): path to the dataset json file
        batch_size (int): number of examples to be fed into the model at once
        tag (str): string that prefixes the model_name. if "best", the "best_model" is used
        model_name (str): name of the model, likely either "model_state_dict.pth" or "model"
        device (torch.device): device that the evaluation should run on
        add_filename (bool): if true, the filename is added to each example in `save_json`
        add_maxdecode (bool): if true, the predictions using max decoding will be added in
            addition to the predictions from the ctc_decoder
        formatted (bool): if true, `format_save` will be used instead of `json_save`, where
            `format_save` outputs a more human-readable output file
        config_path (str): specific path to the config file, if the one in `model_path`
            is not desired
        out_file (str): path where the output file will be saved
    Returns:
        (float): the computed error rate of the model on the dataset
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_path, preproc_path, config_path = get_names(model_path,
                                                      tag=tag,
                                                      model_name=model_name,
                                                      get_config=True)

    # load and update preproc
    preproc = read_pickle(preproc_path)
    preproc.update()

    # load and assign config
    config = load_config(config_path)
    model_cfg = config['model']
    # create `blank_idx` in the model_cfg section
    model_cfg.update({'blank_idx': config['preproc']['blank_idx']})

    # create model
    model = CTC_train(preproc.input_dim,
                      preproc.vocab_size,
                      model_cfg)
    state_dict = load_state_dict(model_path, device=device)
    model.load_state_dict(state_dict)

    ldr = loader.make_loader(dataset_json,
                             preproc,
                             batch_size)

    model.to(device)
    model.set_eval()
    print(f"preproc train_status before set_eval: {preproc.train_status}")
    preproc.set_eval()
    preproc.use_log = False
    print(f"preproc train_status after set_eval: {preproc.train_status}")

    results = eval_loop(model, ldr, device)
    print(f"number of examples: {len(results)}")
    #results_dist = [[(preproc.decode(pred[0]), preproc.decode(pred[1]), prob)]
    #                for example_dist in results_dist
    #                for pred, prob in example_dist]
    results = [(preproc.decode(label), preproc.decode(pred), conf)
               for label, pred, conf in results]
    #maxdecode_results = [(preproc.decode(label), preproc.decode(pred))
    #                     for label, pred in results]
    cer = speech.compute_cer(results, verbose=True)
    print("PER {:.3f}".format(cer))

    if out_file is not None:
        compile_save(results, dataset_json, out_file, formatted, add_filename)

    return round(cer, 3)
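# Hedged usage sketch (illustrative, not from the repo): calling run_eval() above directly.
# The paths are placeholders; with device=None the function falls back to the CPU when no
# GPU is available.
if __name__ == "__main__":
    per = run_eval(
        model_path="checkpoints/run1",    # assumed directory containing model, preproc, and config
        dataset_json="data/test.json",    # assumed dataset json
        batch_size=8,
        tag="best",
        formatted=True,
        out_file="predictions.txt")
    print(f"phoneme error rate: {per}")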
def run(local_rank: int, config: dict) -> None:
    """Main function that defines the data, optimizer, and model objects and runs the
    training and evaluation loops.

    Args:
        local_rank (int): rank of the process on the GPU
        config (dict): training configuration dict
    """
    # unpacking the config
    data_cfg = config["data"]
    log_cfg = config["logger"]
    preproc_cfg = config["preproc"]
    opt_cfg = config["optimizer"]
    model_cfg = config["model"]
    train_cfg = config['training']
    ckpt_cfg = config['checkpoint']

    gcs_ckpt_handler = GCSCheckpointHandler(ckpt_cfg)

    # save the config to gcs
    os.makedirs(ckpt_cfg['local_save_path'], exist_ok=True)
    with open(os.path.join(ckpt_cfg['local_save_path'], "ctc_config.yaml"), 'w') as fid:
        yaml.dump(config, fid)
    gcs_ckpt_handler.upload_to_gcs("ctc_config.yaml")

    # setting up the distributed training environment
    dist.init_process_group(backend='nccl')
    torch.cuda.set_device(local_rank)
    print(f"local_rank: {local_rank}, dist.get_rank: {torch.distributed.get_rank()}")
    is_rank_0 = (torch.distributed.get_rank() == 0)

    # defining the logging and debugging modes
    use_log = log_cfg["use_log"] and is_rank_0
    debug_mode = log_cfg["debug_mode"]
    if debug_mode:
        torch.autograd.set_detect_anomaly(True)

    # create a logger, rank_0 boolean is contained in `use_log`
    logger = get_logger("train_log", log_cfg['log_file'], log_cfg['level']) if use_log else None

    # creates tensorboardX writer in rank_0 process
    tbX_writer = SummaryWriter(logdir=ckpt_cfg["local_save_path"]) if is_rank_0 else None

    # Load previous train state: dict with contents:
    # {start_epoch: int, run_state: (int, float), best_so_far: float, learning_rate: float}
    train_state_path = gcs_ckpt_handler.download_from_gcs_bucket(
        os.path.join(ckpt_cfg['gcs_dir'], "train_state.pickle"))
    if train_state_path:
        print(f"load train_state from: {train_state_path}")
        train_state = read_pickle(train_state_path)
    # if train_state doesn't exist, create an empty dict and load from the config
    else:
        print("load train_state from config")
        train_state = dict()

    # the get-statements load from train_state if the key exists, and from opt_cfg otherwise
    run_state = train_state.get('run_state', opt_cfg['run_state'])
    best_so_far = train_state.get('best_so_far', opt_cfg['best_so_far'])
    start_epoch = train_state.get('start_epoch', opt_cfg['start_epoch'])

    # create the preproc object and data loaders
    batch_size = opt_cfg["batch_size"]
    preproc = loader.Preprocessor(data_cfg["train_set"],
                                  preproc_cfg,
                                  logger,
                                  start_and_end=data_cfg["start_and_end"])
    train_ldr = loader.make_ddp_loader(data_cfg["train_set"],
                                       preproc,
                                       batch_size,
                                       num_workers=data_cfg["num_workers"])

    # create the dev-set loaders in the rank_0 process
    if is_rank_0:
        dev_ldr_dict = dict()
        for dev_name, dev_path in data_cfg["dev_sets"].items():
            dev_ldr = loader.make_loader(dev_path,
                                         preproc,
                                         batch_size=8,
                                         num_workers=data_cfg["num_workers"])
            dev_ldr_dict.update({dev_name: dev_ldr})

    # Model
    # add the blank_idx to model_cfg
    model_cfg.update({'blank_idx': preproc_cfg['blank_idx']})
    model = CTC_train(preproc.input_dim,
                      preproc.vocab_size,
                      model_cfg)

    # load a model from checkpoint, if it exists
    model_ckpt_path = gcs_ckpt_handler.download_from_gcs_bucket(
        os.path.join(ckpt_cfg['gcs_dir'], "ckpt_model_state_dict.pth"))
    if model_ckpt_path:
        model_cfg['local_trained_path'] = model_ckpt_path
        model = load_from_trained(model, model_cfg)
        print(f"Successfully loaded weights from checkpoint: {ckpt_cfg['gcs_dir']}")
    # if a model checkpoint doesn't exist, load from trained if selected and possible
    else:
        if model_cfg["load_trained"]:
            local_trained_path = gcs_ckpt_handler.download_from_gcs_bucket(
                model_cfg['gcs_trained_path'])
            if local_trained_path:
                model_cfg['local_trained_path'] = local_trained_path
                model = load_from_trained(model, model_cfg)
                print(f"Successfully loaded weights from trained model: {model_cfg['gcs_trained_path']}")
            else:
                print(f"no model found at gcs location: {model_cfg['gcs_trained_path']}")
        else:
            print("model trained from scratch")

    # Optimizer and learning rate scheduler
    learning_rate = opt_cfg['learning_rate']
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=learning_rate,  # from train_state or opt_config
        momentum=opt_cfg["momentum"],
        dampening=opt_cfg["dampening"])

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt_cfg["sched_step"],
        gamma=opt_cfg["sched_gamma"])

    # gradient scaler, too large a value for init_scale produces NaN gradients
    scaler = GradScaler(enabled=train_cfg['amp'], init_scale=16)

    # call the ddp wrappers
    model.cuda(local_rank)
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[local_rank],
                                                output_device=local_rank)

    if use_log:
        logger.info("train: ====== Model, loaders, optimizer created =======")
        logger.info(f"train: model: {model}")
        logger.info(f"train: preproc: {preproc}")
        logger.info(f"train: optimizer: {optimizer}")
        logger.info(f"train: config: {config}")

    # printing to the output file
    if is_rank_0:
        print("====== Model, loaders, optimizer created =======")
        print(f"model: {model}")
        print(f"preproc: {preproc}")
        print(f"optimizer: {optimizer}")
        print(f"config: {config}")

    # training loop
    for epoch in range(start_epoch, opt_cfg["epochs"]):
        start = time.time()
        for group in optimizer.param_groups:
            if is_rank_0:
                print(f'learning rate: {group["lr"]}')
            if use_log:
                logger.info(f"train: learning rate: {group['lr']}")

        try:
            run_state = run_epoch(model, optimizer, train_ldr, logger, debug_mode, tbX_writer,
                                  *run_state, local_rank, train_cfg['loss_name'],
                                  ckpt_cfg['local_save_path'], gcs_ckpt_handler, scaler)
        except Exception as err:
            if use_log:
                logger.error(f"Exception raised: {err}")
                logger.error("train: ====In except block====")
                logger.error(f"train: state_dict: {model.module.state_dict()}")
                log_model_grads(model.module.named_parameters(), logger)
            raise Exception('Failure in run_epoch').with_traceback(err.__traceback__)
        finally:
            # used to ensure that plots are closed even if an exception is raised
            plt.close('all')

        # update the learning rate
        lr_scheduler.step()

        if use_log:
            logger.info("train: ====== Run_state finished =======")
            logger.info(f"train: preproc type: {type(preproc)}")

        if is_rank_0:
            msg = "Epoch {} completed in {:.2f} (hr)."
            epoch_time_hr = (time.time() - start) / 60 / 60
            print(msg.format(epoch, epoch_time_hr))
            if use_log:
                logger.info(msg.format(epoch, epoch_time_hr))
            tbX_writer.add_scalars('train/stats', {"epoch_time_hr": epoch_time_hr}, epoch)

            # the logger needs to be removed to save the model
            if use_log:
                preproc.logger = None
            speech.save(model.module, preproc, ckpt_cfg["local_save_path"])
            gcs_ckpt_handler.upload_to_gcs("model_state_dict.pth")
            gcs_ckpt_handler.upload_to_gcs("preproc.pyc")
            if use_log:
                logger.info("train: ====== model saved =======")
                preproc.logger = logger

            # creating the dictionaries that hold the PER and loss values
            dev_loss_dict = dict()
            dev_per_dict = dict()
            # iterating through the dev-set loaders to calculate the PER/loss
            for dev_name, dev_ldr in dev_ldr_dict.items():
                print(f"evaluating devset: {dev_name}")
                if use_log:
                    logger.info(f"train: === evaluating devset: {dev_name} ==")

                dev_loss, dev_per = eval_dev(model.module, dev_ldr, preproc, logger,
                                             train_cfg['loss_name'])
                dev_loss_dict.update({dev_name: dev_loss})
                dev_per_dict.update({dev_name: dev_per})

                if use_log:
                    logger.info(f"train: ====== eval_dev {dev_name} finished =======")

                # Save the best model on the dev set
                if dev_name == data_cfg['dev_set_save_reference']:
                    print(f"dev_reference {dev_name}: current PER: {dev_per} "
                          f"vs. best_so_far: {best_so_far}")
                    if use_log:
                        logger.info(f"dev_reference {dev_name}: current PER: {dev_per} "
                                    f"vs. best_so_far: {best_so_far}")

                    if dev_per < best_so_far:
                        if use_log:
                            preproc.logger = None  # remove the logger to save the model
                        best_so_far = dev_per
                        speech.save(model.module, preproc, ckpt_cfg["local_save_path"], tag="best")
                        gcs_ckpt_handler.upload_to_gcs("best_model_state_dict.pth")
                        gcs_ckpt_handler.upload_to_gcs("best_preproc.pyc")
                        if use_log:
                            preproc.logger = logger
                            logger.info(f"model saved based on PER for the {dev_name} dataset")
                        print(f"UPDATED: best_model based on PER {best_so_far} for {dev_name} devset")

            per_diff_dict = calc_per_difference(dev_per_dict)

            tbX_writer.add_scalars('dev/loss', dev_loss_dict, epoch)
            tbX_writer.add_scalars('dev/per', dev_per_dict, epoch)
            tbX_writer.add_scalars('dev/per/diff', per_diff_dict, epoch)
            gcs_ckpt_handler.upload_tensorboard_ckpt()

            learning_rate = list(optimizer.param_groups)[0]["lr"]
            # save the current state of training
            train_state = {"start_epoch": epoch + 1,
                           "run_state": run_state,
                           "best_so_far": best_so_far,
                           "learning_rate": learning_rate}
            write_pickle(os.path.join(ckpt_cfg["local_save_path"], "train_state.pickle"),
                         train_state)
            gcs_ckpt_handler.upload_to_gcs("train_state.pickle")
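# Hedged launch sketch (illustrative, not from the repo): how run(local_rank, config) above
# might be invoked. It assumes the script is started with
#   torchrun --nproc_per_node=<num_gpus> train.py --config ctc_config.yaml
# which sets the LOCAL_RANK/RANK/WORLD_SIZE environment variables that
# dist.init_process_group('nccl') reads; the config file name is an assumption.
if __name__ == "__main__":
    import argparse
    import os
    import yaml

    parser = argparse.ArgumentParser(description="Distributed CTC training.")
    parser.add_argument("--config", default="ctc_config.yaml",
                        help="path to the training config yaml")
    args = parser.parse_args()

    with open(args.config, 'r') as fid:
        config = yaml.safe_load(fid)

    local_rank = int(os.environ["LOCAL_RANK"])
    run(local_rank, config)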