Ejemplo n.º 1
0
def main(cfg: DictConfig) -> None:
    """Evaluate a pre-trained token classification model and run sample inference.

    Builds a single-device trainer from ``cfg.trainer``, loads the model named
    by ``cfg.pretrained_model`` (an existing path is restored with
    ``restore_from``; otherwise the value must be one of the available model
    names), and, when ``model.dataset.data_dir`` points to an existing
    directory, tests the model and evaluates it on the ``model.test_ds`` text
    and labels files. Predictions for two example queries are logged at the end
    regardless of whether the evaluation ran.

    Args:
        cfg: Configuration providing ``trainer``, ``exp_manager``,
            ``pretrained_model`` and ``model`` (with ``dataset`` and
            ``test_ds``).

    Raises:
        ValueError: If ``model.test_ds`` is missing from the config, if
            ``pretrained_model`` is empty, or if it is neither an existing
            path nor an available model name.
    """
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results')

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError(
            'model.test_ds was not found in the config, skipping evaluation')

    # A falsy ``gpus`` (0 or None) selects CPU; any other value collapses to a
    # single GPU so that evaluation does not run under DDP.
    gpu = 1 if cfg.trainer.gpus else 0

    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run evaluation and inference script a pre-trained model or .nemo file must be provided. '
            f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = TokenClassificationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(
    ):
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide path to the pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)
    if data_dir is None:
        logging.error(
            'No dataset directory provided. Skipping evaluation. '
            'To run evaluation on a file, specify path to the directory that contains test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.'
        )
    elif not os.path.exists(data_dir):
        logging.error(
            f'{data_dir} is not found, skipping evaluation on the test set.')
    else:
        model.update_data_dir(data_dir=data_dir)
        model._cfg.dataset = cfg.model.dataset

        # ``model.test_ds`` was validated at the top of the function, so it
        # does not need to be re-checked here.
        if model.prepare_test(trainer):
            model.setup_test_data(cfg.model.test_ds)
            trainer.test(model)
        else:
            logging.error(
                'Skipping the evaluation. The trainer is not setup properly.')

        # File-based evaluation needs a valid data_dir, so it must stay inside
        # this branch; otherwise os.path.join would be called with None or a
        # non-existent directory after "skipping evaluation" was logged.
        model.evaluate_from_file(
            text_file=os.path.join(data_dir, cfg.model.test_ds.text_file),
            labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file),
            output_dir=exp_dir,
            add_confusion_matrix=True,
            normalize_confusion_matrix=True,
        )

    # run an inference on a few examples
    queries = [
        'we bought four shirts from the nvidia gear store in santa clara.',
        'Nvidia is a company.'
    ]
    results = model.add_predictions(queries, output_file='predictions.txt')

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
Ejemplo n.º 2
0
def main(cfg: DictConfig) -> None:
    """Train or fine-tune a token classification model, then evaluate it.

    When ``cfg.pretrained_model`` is empty, a new model is built from
    ``cfg.model``. Otherwise the named model is loaded with
    ``from_pretrained`` and is fine-tuned only if ``model.dataset.data_dir``
    is set; without a data dir, training is skipped and the pretrained
    weights are used as-is. After training, the model is saved to
    ``cfg.model.nemo_path`` when that value is set, the model is evaluated on
    the ``model.validation_ds`` files, and predictions for two example queries
    are logged.

    Args:
        cfg: Configuration providing ``trainer``, ``pretrained_model`` and
            ``model``; ``exp_manager`` is optional.
    """
    trainer = pl.Trainer(**cfg.trainer)
    exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = TokenClassificationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            # we can also do finetuning of the pretrained model but it will require
            # setting up train and validation Pytorch DataLoaders
            # setup the data dir to get class weights statistics
            model.update_data_dir(data_dir=data_dir)
            # then we're setting up loss, use model.dataset.class_balancing,
            # if you want to add class weights to the CrossEntropyLoss
            model.setup_loss(class_balancing=cfg.model.dataset.class_balancing)
            # finally, setup train and validation Pytorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            logging.info('Using config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                'Data dir should be specified for finetuning the pretrained model. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )

    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)

    # After model training is done, you can use the model for inference.
    # You can either evaluate data from a text_file that follows training data format,
    # or provide a list of queries you want to add entities to
    #
    # During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU
    # and no DDP to obtain accurate results
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results')
    # A falsy ``gpus`` (0 or None) selects CPU; any other value collapses to a
    # single GPU for the evaluation trainer.
    gpu = 1 if cfg.trainer.gpus else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    if do_training:
        # run evaluation on a dataset from file
        # only possible if model.dataset.data_dir is specified
        # change the path to the file you want to use for the final evaluation
        model.evaluate_from_file(
            text_file=os.path.join(cfg.model.dataset.data_dir,
                                   cfg.model.validation_ds.text_file),
            labels_file=os.path.join(cfg.model.dataset.data_dir,
                                     cfg.model.validation_ds.labels_file),
            output_dir=exp_dir,
            add_confusion_matrix=True,
            normalize_confusion_matrix=True,
        )

    # run an inference on a few examples
    queries = [
        'we bought four shirts from the nvidia gear store in santa clara.',
        'Nvidia is a company.'
    ]
    results = model.add_predictions(queries)

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
Ejemplo n.º 3
0
def main(cfg: DictConfig) -> None:
    """Train a new token classification model or fine-tune a pre-trained one.

    When ``cfg.pretrained_model`` is empty, a new model is built from
    ``cfg.model``. Otherwise the value is treated as a path to restore from
    if it exists, or as one of the available model names, and
    ``model.dataset.data_dir`` must point to an existing directory so that the
    training and validation data and the loss can be set up. The model is then
    fitted and, when ``cfg.model.nemo_path`` is set, saved to that path.

    Args:
        cfg: Configuration providing ``trainer``, ``pretrained_model`` and
            ``model``; ``exp_manager`` is optional.

    Raises:
        ValueError: If ``pretrained_model`` is neither an existing path nor an
            available model name, or if a pre-trained model is used and
            ``model.dataset.data_dir`` is unset or does not exist.
    """
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = TokenClassificationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            model = TokenClassificationModel.restore_from(cfg.pretrained_model,
                                                          trainer=trainer)
        elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(
        ):
            model = TokenClassificationModel.from_pretrained(
                cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from {TokenClassificationModel.list_available_models()}'
            )

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            if not os.path.exists(data_dir):
                raise ValueError(f'{data_dir} is not found')

            # we can also do finetuning of the pretrained model but it will require
            # setup the data dir to get class weights statistics
            model.update_data_dir(data_dir=data_dir)
            # finally, setup train and validation Pytorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            # then we're setting up loss, use model.dataset.class_balancing,
            # if you want to add class weights to the CrossEntropyLoss
            model.setup_loss(class_balancing=cfg.model.dataset.class_balancing)
            logging.info('Using config file of the pretrained model')
        else:
            # Implicit concatenation keeps source indentation out of the
            # message (a backslash continuation inside the literal does not).
            raise ValueError(
                'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file '
                'with "model.dataset.data_dir" argument')

    trainer.fit(model)

    if cfg.model.nemo_path:
        model.save_to(cfg.model.nemo_path)
        logging.info(f'The model was saved to {cfg.model.nemo_path}')