Example #1
def run(config):

    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]

    # Loaders
    batch_size = opt_cfg["batch_size"]
    preproc = loader.Preprocessor(data_cfg["train_set"],
                  start_and_end=data_cfg["start_and_end"])
    train_ldr = loader.make_loader(data_cfg["train_set"],
                        preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"],
                        preproc, batch_size)

    # Model: resolve the model class by name from the models module
    model_class = eval("models." + model_cfg["class"])
    model = model_class(preproc.input_dim,
                        preproc.vocab_size,
                        model_cfg)
    # Place the model on GPU when available, otherwise on CPU
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    else:
        model.cpu()
    # Optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                    lr=opt_cfg["learning_rate"],
                    momentum=opt_cfg["momentum"])

    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()

        run_state = run_epoch(model, optimizer, train_ldr, *run_state)

        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))

        dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

        # Log for tensorboard
        tb.log_value("dev_loss", dev_loss, e)
        tb.log_value("dev_cer", dev_cer, e)

        speech.save(model, preproc, config["save_path"])

        # Save the best model on the dev set
        if dev_cer < best_so_far:
            best_so_far = dev_cer
            speech.save(model, preproc,
                    config["save_path"], tag="best")
Example #2
def run(config, use_cuda):
    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]
    aud_cfg = config['audio']
    batch_size = opt_cfg["batch_size"]

    load_pre = True

    if load_pre:
        # TODO: check that a pretrained model actually exists; if not, initialize the model and the rest from scratch
        model, _, preproc = speech.load("ctc_best", tag="best")
    else:
        preproc = loader.Preprocessor(data_cfg["train_set"], aud_cfg, start_and_end=data_cfg["start_and_end"])
        # eval('print("Hello")') will actually call print("Hello")
        model_class = eval("models." + model_cfg["class"])
        # define model
        model = model_class(preproc.input_dim, preproc.vocab_size, model_cfg)

    model = model.cuda() if use_cuda else model.cpu()
    optimizer = torch.optim.SGD(model.parameters(), lr=opt_cfg["learning_rate"],
                                momentum=opt_cfg["momentum"])
    # Dataloader is a subclass of pytorch.utils.dataloader. Can iterate
    train_ldr = loader.make_loader(data_cfg["train_set"], preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"], preproc, batch_size)

    print("Epochs to train:", opt_cfg["epochs"])
    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()

        run_state = run_epoch(model, optimizer, train_ldr, *run_state)

        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))
        if (e % 10 == 0) or (e == (opt_cfg["epochs"] - 1)):
            dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

            # Log for tensorboard
            tb.log_value("dev_loss", dev_loss, e)
            tb.log_value("dev_cer", dev_cer, e)

            # Save the best model on the dev set; dev_cer is only fresh on
            # eval epochs, so the check belongs inside this block
            if dev_cer < best_so_far:
                best_so_far = dev_cer
                speech.save(model, optimizer, preproc, config["save_path"], tag="best")

        speech.save(model, optimizer, preproc, config["save_path"])
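
Unlike Example #1, the dev set here is only evaluated every 10 epochs and on the final epoch. A small self-contained illustration of that cadence:

# Epochs on which eval_dev runs, given (e % 10 == 0) or (e == epochs - 1).
epochs = 25
eval_epochs = [e for e in range(epochs) if e % 10 == 0 or e == epochs - 1]
print(eval_epochs)  # [0, 10, 20, 24]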
Example #3
def run(model_path, dataset_json,
        batch_size=8, tag="best",
        out_file=None):

    use_cuda = torch.cuda.is_available()

    model, preproc = speech.load(model_path, tag=tag)
    ldr = loader.make_loader(dataset_json,
            preproc, batch_size)

    model = model.cuda() if use_cuda else model.cpu()
    model.set_eval()

    results = eval_loop(model, ldr)
    results = [(preproc.decode(label), preproc.decode(pred))
               for label, pred in results]
    cer = speech.compute_cer(results)
    print("CER {:.3f}".format(cer))

    if out_file is not None:
        with open(out_file, 'w') as fid:
            for label, pred in results:
                res = {'prediction' : pred,
                       'label' : label}
                json.dump(res, fid)
                fid.write("\n")
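
The out_file produced above holds one JSON object per line. A short sketch of reading the results back; the file name is illustrative:

import json

# Read back the JSON-lines predictions written by run(); the path is a placeholder.
with open("predictions.jsonl") as fid:
    results = [json.loads(line) for line in fid]

for res in results[:3]:
    print(res["label"], "->", res["prediction"])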
Example #4
def test_loader():

    batch_size = 2
    data_json = "test.json"
    preproc = loader.Preprocessor(data_json)
    ldr = loader.make_loader(data_json, preproc, batch_size, num_workers=0)

    # Test that batches are properly sorted by size
    for inputs, labels in ldr:
        assert inputs[0].shape == inputs[1].shape
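
The assertion holds because the loader groups utterances of similar length into the same batch. A toy illustration of that size-ordering idea, independent of the real loader:

# Sort toy utterance lengths; adjacent indices then form batches of similar shape.
durations = [320, 95, 210, 100]  # hypothetical frame counts
order = sorted(range(len(durations)), key=lambda i: durations[i])
print(order)  # [1, 3, 2, 0] -> batches of 2: (1, 3) and (2, 0)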
Example #5
def main(model_path: str, json_path: str, use_cuda: bool, log_name: str,
         use_augmentation: bool):
    """
    runs the eval_dev loop in train continually while saving
    relevant date to a log file
    """

    # create logger
    logger = logging.getLogger("eval-dev_log")
    logger.setLevel(logging.DEBUG)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(log_name + ".log")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  "%Y-%m-%d %H:%M:%S")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # loading model and preproc
    model, preproc = speech.load(model_path, tag="best")
    model = model.cuda() if use_cuda else model.cpu()
    print(f"spec_aug status: {preproc.spec_augment}")
    # creating loader
    dev_ldr = loader.make_loader(json_path, preproc, batch_size=1)

    iterations = 500

    logger.info("============= Trial info ============")
    logger.info(f"model path: {model_path}")
    logger.info(f"json path: {json_path}")
    logger.info(f"use_augmentation: {use_augmentation}")
    logger.info(f"preproc: {preproc}")
    logger.info(f"model: {model}")

    for i in range(iterations):
        logger.info(f"\n=================================================\n")
        logger.info(f"Iteration: {i}")

        loss, cer = eval_dev(model, dev_ldr, preproc, logger, use_augmentation)
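
A hypothetical command-line wrapper for main(); the flag names are illustrative and not taken from the source:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="continuously run eval_dev and log the results")
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--json-path", required=True)
    parser.add_argument("--log-name", default="eval-dev")
    parser.add_argument("--use-cuda", action="store_true")
    parser.add_argument("--use-augmentation", action="store_true")
    args = parser.parse_args()

    main(args.model_path, args.json_path, args.use_cuda, args.log_name,
         args.use_augmentation)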
Example #6
def run_eval(
        model_path,
        dataset_json,
        batch_size=8,
        tag="best",
        model_name="model_state_dict.pth",
        device=None,
        add_filename=False,
        add_maxdecode: bool = False,
        formatted=False,
        config_path=None,
        out_file=None) -> float:
    """
    calculates the  distance between the predictions from
    the model in model_path and the labels in dataset_json

    Args:
        model_path (str): path to the directory that contains the model,
        dataset_json (str): path to the dataset json file
        batch_size (int): number of examples to be fed into the model at once
        tag (str): string that prefixes the model_name.  if best,  the "best_model" is used
        model_name (str): name of the model, likely either "model_state_dict.pth" or "model"
        device (torch.device): device that the evaluation should run on
        add_filename (bool): if true, the filename is added to each example in `save_json`
        add_maxdecode (bool): if true, the predictions using max decoding will be added in addition 
            to the predictions from the ctc_decoder
        formatted (bool): if true, the `format_save` will be used instead of `json_save` where 
            `format_save` outputs a more human-readable output file
        config_path (bool): specific path to the config file, if the one in `model_path` is not desired
        out_file (str): path where the output file will be saved
    
    Returns:
        (int): returns the computed error rate of the model on the dataset
    """

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_path, preproc_path, config_path = get_names(model_path, tag=tag, model_name=model_name, get_config=True)
    
    # load and update preproc
    preproc = read_pickle(preproc_path)
    preproc.update()

    # load and assign config
    config = load_config(config_path)
    model_cfg = config['model']
    model_cfg.update({'blank_idx': config['preproc']['blank_idx']})  # create `blank_idx` in the model_cfg section

    # create model
    model = CTC_train(
        preproc.input_dim,
        preproc.vocab_size,
        model_cfg
    )

    state_dict = load_state_dict(model_path, device=device)
    model.load_state_dict(state_dict)
    
    ldr = loader.make_loader(
        dataset_json,
        preproc,
        batch_size
    )
    
    model.to(device)
    model.set_eval()
    print(f"preproc train_status before set_eval: {preproc.train_status}")
    preproc.set_eval()
    preproc.use_log = False
    print(f"preproc train_status after set_eval: {preproc.train_status}")


    results = eval_loop(model, ldr, device)
    print(f"number of examples: {len(results)}")
    results = [(preproc.decode(label), preproc.decode(pred), conf)
               for label, pred, conf in results]
    cer = speech.compute_cer(results, verbose=True)

    print("PER {:.3f}".format(cer))
    
    if out_file is not None:
        compile_save(results, dataset_json, out_file, formatted, add_filename)
    
    return round(cer, 3)
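
An illustrative call to run_eval(); the paths are placeholders and the keyword values simply echo the defaults documented above:

# Sketch of invoking run_eval(); the model directory and dataset paths are hypothetical.
per = run_eval(
    "ckpts/ctc_model",
    "data/test.json",
    batch_size=16,
    tag="best",
    formatted=True,
    out_file="eval_results.txt",
)
print(f"held-out PER: {per}")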
Example #7
def run(local_rank: int, config: dict) -> None:
    """Main function that defines the data, optimizer, and model objects and runs the training
    and evaluation loops.

    Args:
        local_rank (int): rank of the process on the GPU
        config (dict): training configuration dict
    """
    # unpacking the config
    data_cfg = config["data"]
    log_cfg = config["logger"]
    preproc_cfg = config["preproc"]
    opt_cfg = config["optimizer"]
    model_cfg = config["model"]
    train_cfg = config['training']
    ckpt_cfg = config['checkpoint']

    gcs_ckpt_handler = GCSCheckpointHandler(ckpt_cfg)

    # save the config to gcs
    os.makedirs(ckpt_cfg['local_save_path'], exist_ok=True)
    with open(os.path.join(ckpt_cfg['local_save_path'], "ctc_config.yaml"),
              'w') as fid:
        yaml.dump(config, fid)
    gcs_ckpt_handler.upload_to_gcs("ctc_config.yaml")

    # setting up the distributed training environment
    dist.init_process_group(backend='nccl')
    torch.cuda.set_device(local_rank)
    print(
        f"local_rank: {local_rank}, dist.get_rank: {torch.distributed.get_rank()}"
    )
    is_rank_0 = (torch.distributed.get_rank() == 0)

    # defining the logging and debugging modes
    use_log = log_cfg["use_log"] and is_rank_0
    debug_mode = log_cfg["debug_mode"]
    if debug_mode: torch.autograd.set_detect_anomaly(True)

    # create a logger, rank_0 boolean is contained in `use_log`
    logger = get_logger("train_log", log_cfg['log_file'],
                        log_cfg['level']) if use_log else None

    # creates tensorboardX writer in rank_0 process
    tbX_writer = SummaryWriter(
        logdir=ckpt_cfg["local_save_path"]) if is_rank_0 else None

    # Load previous train state: dict with contents:
    # {start_epoch: int, run_state: (int, float), best_so_far: float, learning_rate: float}
    train_state_path = gcs_ckpt_handler.download_from_gcs_bucket(
        os.path.join(ckpt_cfg['gcs_dir'], "train_state.pickle"))
    if train_state_path:
        print(f"load train_state from: {train_state_path}")
        train_state = read_pickle(train_state_path)
    # if train_state_path doesn't exist, create an empty dict so values load from the config
    else:
        print("load train_state from config")
        train_state = dict()

    # the get-statements will load from train_state if key exists, and from opt_cfg otherwise
    run_state = train_state.get('run_state', opt_cfg['run_state'])
    best_so_far = train_state.get('best_so_far', opt_cfg['best_so_far'])
    start_epoch = train_state.get('start_epoch', opt_cfg['start_epoch'])

    # create the preproc object and data loaders
    batch_size = opt_cfg["batch_size"]
    preproc = loader.Preprocessor(data_cfg["train_set"],
                                  preproc_cfg,
                                  logger,
                                  start_and_end=data_cfg["start_and_end"])

    train_ldr = loader.make_ddp_loader(data_cfg["train_set"],
                                       preproc,
                                       batch_size,
                                       num_workers=data_cfg["num_workers"])

    # create the dev-set loaders in the rank_0 process
    if is_rank_0:
        dev_ldr_dict = dict()
        for dev_name, dev_path in data_cfg["dev_sets"].items():
            dev_ldr = loader.make_loader(dev_path,
                                         preproc,
                                         batch_size=8,
                                         num_workers=data_cfg["num_workers"])
            dev_ldr_dict.update({dev_name: dev_ldr})

    # Model
    # add the blank_idx to model_cfg
    model_cfg.update({'blank_idx': preproc_cfg['blank_idx']})
    model = CTC_train(preproc.input_dim, preproc.vocab_size, model_cfg)

    # load a model from checkpoint, if it exists
    model_ckpt_path = gcs_ckpt_handler.download_from_gcs_bucket(
        os.path.join(ckpt_cfg['gcs_dir'], "ckpt_model_state_dict.pth"))
    if model_ckpt_path:
        model_cfg['local_trained_path'] = model_ckpt_path
        model = load_from_trained(model, model_cfg)
        print(
            f"Successfully loaded weights from checkpoint: {ckpt_cfg['gcs_dir']}"
        )
    # if a model checkpoint doesn't exist, load from trained if selected and possible
    else:
        if model_cfg["load_trained"]:
            local_trained_path = gcs_ckpt_handler.download_from_gcs_bucket(
                model_cfg['gcs_trained_path'])
            if local_trained_path:
                model_cfg['local_trained_path'] = local_trained_path
                model = load_from_trained(model, model_cfg)
                print(
                    f"Successfully loaded weights from trained model: {model_cfg['gcs_trained_path']}"
                )
            else:
                print(
                    f"no model found at gcs location: {model_cfg['gcs_trained_path']}"
                )

        else:
            print("model trained from scratch")

    # Optimizer and learning rate scheduler
    learning_rate = opt_cfg['learning_rate']
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=learning_rate,  # from train_state or opt_config
        momentum=opt_cfg["momentum"],
        dampening=opt_cfg["dampening"])

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt_cfg["sched_step"],
        gamma=opt_cfg["sched_gamma"])

    # gradient scaler, too large a value for init_scale produces NaN gradients
    scaler = GradScaler(enabled=train_cfg['amp'], init_scale=16)

    # call the ddp wrappers
    model.cuda(local_rank)
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[local_rank],
                                                output_device=local_rank)

    if use_log:
        logger.info(
            "train: ====== Model, loaders, optimizer created =======")
        logger.info(f"train: model: {model}")
        logger.info(f"train: preproc: {preproc}")
        logger.info(f"train: optimizer: {optimizer}")
        logger.info(f"train: config: {config}")

    # printing to the output file
    if is_rank_0:
        print(f"====== Model, loaders, optimimzer created =======")
        print(f"model: {model}")
        print(f"preproc: {preproc}")
        print(f"optimizer: {optimizer}")
        print(f"config: {config}")

    # training loop
    for epoch in range(start_epoch, opt_cfg["epochs"]):

        start = time.time()
        for group in optimizer.param_groups:
            if is_rank_0: print(f'learning rate: {group["lr"]}')
            if use_log: logger.info(f"train: learning rate: {group['lr']}")

        try:
            run_state = run_epoch(model, optimizer, train_ldr, logger,
                                  debug_mode, tbX_writer, *run_state,
                                  local_rank, train_cfg['loss_name'],
                                  ckpt_cfg['local_save_path'],
                                  gcs_ckpt_handler, scaler)
        except Exception as err:
            if use_log:
                logger.error(f"Exception raised: {err}")
                logger.error(f"train: ====In except block====")
                logger.error(f"train: state_dict: {model.module.state_dict()}")
                log_model_grads(model.module.named_parameters(), logger)
            raise Exception('Failure in run_epoch').with_traceback(
                err.__traceback__)
        finally:  # used to ensure that plots are closed even if exception raised
            plt.close('all')

        # update the learning rate
        lr_scheduler.step()

        if use_log:
            logger.info(f"train: ====== Run_state finished =======")
            logger.info(f"train: preproc type: {type(preproc)}")
        if is_rank_0:
            msg = "Epoch {} completed in {:.2f} (hr)."
            epoch_time_hr = (time.time() - start) / 60 / 60
            print(msg.format(epoch, epoch_time_hr))
            if use_log: logger.info(msg.format(epoch, epoch_time_hr))
            tbX_writer.add_scalars('train/stats',
                                   {"epoch_time_hr": epoch_time_hr}, epoch)

            # the logger needs to be removed to save the model
            if use_log: preproc.logger = None
            speech.save(model.module, preproc, ckpt_cfg["local_save_path"])
            gcs_ckpt_handler.upload_to_gcs("model_state_dict.pth")
            gcs_ckpt_handler.upload_to_gcs("preproc.pyc")

            if use_log:
                logger.info(f"train: ====== model saved =======")
                preproc.logger = logger

            # creating the dictionaries that hold the PER and loss values
            dev_loss_dict = dict()
            dev_per_dict = dict()
            # iterating through the dev-set loaders to calculate the PER/loss
            for dev_name, dev_ldr in dev_ldr_dict.items():
                print(f"evaluating devset: {dev_name}")
                if use_log:
                    logger.info(f"train: === evaluating devset: {dev_name} ==")
                dev_loss, dev_per = eval_dev(model.module, dev_ldr, preproc,
                                             logger, train_cfg['loss_name'])

                dev_loss_dict.update({dev_name: dev_loss})
                dev_per_dict.update({dev_name: dev_per})

                if use_log:
                    logger.info(
                        f"train: ====== eval_dev {dev_name} finished =======")

                # Save the best model on the dev set
                if dev_name == data_cfg['dev_set_save_reference']:
                    print(
                        f"dev_reference {dev_name}: current PER: {dev_per} vs. best_so_far: {best_so_far}"
                    )

                    if use_log:
                        logger.info(
                            f"dev_reference {dev_name}: current PER: {dev_per} vs. best_so_far: {best_so_far}"
                        )
                    if dev_per < best_so_far:
                        if use_log:
                            preproc.logger = None  # remove the logger to save the model
                        best_so_far = dev_per
                        speech.save(model.module,
                                    preproc,
                                    ckpt_cfg["local_save_path"],
                                    tag="best")
                        gcs_ckpt_handler.upload_to_gcs(
                            "best_model_state_dict.pth")
                        gcs_ckpt_handler.upload_to_gcs("best_preproc.pyc")

                        if use_log:
                            preproc.logger = logger
                            logger.info(
                                f"best model saved based on PER for the {dev_name} dataset"
                            )

                        print(
                            f"UPDATED: best_model based on PER {best_so_far} for {dev_name} devset"
                        )

            per_diff_dict = calc_per_difference(dev_per_dict)

            tbX_writer.add_scalars('dev/loss', dev_loss_dict, epoch)
            tbX_writer.add_scalars('dev/per', dev_per_dict, epoch)
            tbX_writer.add_scalars('dev/per/diff', per_diff_dict, epoch)
            gcs_ckpt_handler.upload_tensorboard_ckpt()

            learning_rate = list(optimizer.param_groups)[0]["lr"]
            # save the current state of training
            train_state = {
                "start_epoch": epoch + 1,
                "run_state": run_state,
                "best_so_far": best_so_far,
                "learning_rate": learning_rate
            }
            write_pickle(
                os.path.join(ckpt_cfg["local_save_path"],
                             "train_state.pickle"), train_state)
            gcs_ckpt_handler.upload_to_gcs("train_state.pickle")