Exemple #1
0
def build_trainer(
    config,
    lr: float,
    serialization_dir: str,
    num_epochs: int,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader) -> Trainer:
    """Assemble a ``GradientDescentTrainer`` for ``model``.

    Args:
        config: Object exposing ``shutil_pre_finished_experiment``; when that
            attribute is truthy an existing ``serialization_dir`` is deleted
            before training.
        lr: Learning rate handed to ``AdamOptimizer``.
        serialization_dir: Directory passed to the trainer; it is created if
            it does not exist.
        num_epochs: Number of epochs passed to the trainer.
        model: Model to train. It is moved to the GPU when CUDA is available.
        train_loader: Data loader used for training.
        dev_loader: Data loader used for validation.

    Returns:
        The configured trainer (not yet started).
    """
    use_cuda = torch.cuda.is_available()

    # Move the model to the GPU *before* building the optimizer so that the
    # optimizer is constructed over parameters that already live on the
    # device they will be trained on.
    if use_cuda:
        model.cuda()

    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=lr)

    # Optionally wipe the output of a previous run.
    if os.path.exists(serialization_dir) and config.shutil_pre_finished_experiment:
        shutil.rmtree(serialization_dir)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(serialization_dir, exist_ok=True)

    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        serialization_dir=serialization_dir,
        cuda_device=0 if use_cuda else -1
    )

    return trainer
Exemple #2
0
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    """Train ``model`` with SGD and return it together with the metrics.

    The training instances come from a ``BIODatasetReader`` wrapped around an
    ``ActiveBIODataset`` built from ``train_data``; validation instances come
    from ``valid_reader``. Model selection uses the ``f1-measure-overall``
    validation metric.

    Note: ``optimizer_type`` is accepted but not consulted; an ``optim.SGD``
    optimizer is always constructed.

    Returns:
        A ``(model, metrics)`` tuple, where ``metrics`` is whatever
        ``Trainer.train()`` returns.
    """
    bio_dataset = ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class)
    indexers = {
        'tokens': ELMoTokenCharactersIndexer(),
    }
    train_reader = BIODatasetReader(bio_dataset, token_indexers=indexers)

    # NOTE(review): both readers are given the same literal path; presumably
    # they ignore it and serve in-memory data -- confirm against the readers.
    train_instances = train_reader.read('tmp.txt')
    valid_instances = valid_reader.read('tmp.txt')

    # Device 0 when CUDA was requested, otherwise -1 (CPU).
    cuda_device = 0 if device == 'cuda' else -1
    if cuda_device == 0:
        model = model.cuda(cuda_device)

    sgd = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    batcher = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    batcher.index_with(vocab)

    metrics = Trainer(
        model=model,
        optimizer=sgd,
        iterator=batcher,
        train_dataset=train_instances,
        validation_dataset=valid_instances,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    ).train()

    return model, metrics
def build_trainer(
    config,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    """Assemble a GPU ``GradientDescentTrainer`` for ``model``.

    Args:
        config: Object exposing ``lr``, ``num_epochs`` and
            ``serialization_dir``.
        model: Model to train; it is moved to the GPU unconditionally.
        train_loader: Data loader used for training.
        dev_loader: Data loader used for validation.

    Returns:
        The configured trainer (not yet started), bound to CUDA device 0.
    """
    # Move the model to the GPU *before* building the optimizer so that the
    # optimizer is constructed over parameters that already live on the
    # device they will be trained on.
    model.cuda()
    parameters = [(n, p) for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=config.lr)  # type: ignore
    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=config.num_epochs,
        optimizer=optimizer,
        cuda_device=0,
        serialization_dir=config.serialization_dir)
    return trainer
Exemple #4
0
    def run(  # type: ignore
        self,
        model: Model,
        dataset: DatasetDict,
        split: str = "validation",
        data_loader: Optional[Lazy[TangoDataLoader]] = None,
    ) -> EvaluationResult:
        """
        Runs an evaluation on a dataset.

        * `model` is the model we want to evaluate.
        * `dataset` is the dataset we want to evaluate on.
        * `split` is the name of the split we want to evaluate on.
        * `data_loader` gives you the chance to choose a custom dataloader for the evaluation.
          By default this step evaluates on batches of 32 instances each.

        Returns an `EvaluationResult` holding the final metrics (including the
        mean `"loss"` when the model produced one) and one prediction dict per
        instance.

        Raises `RuntimeError` if the model produced a loss for some batches
        but not for all of them.
        """

        concrete_data_loader: TangoDataLoader
        if data_loader is None:
            concrete_data_loader = BatchSizeDataLoader(dataset.splits[split],
                                                       batch_size=32,
                                                       shuffle=False)
        else:
            concrete_data_loader = data_loader.construct(
                instances=dataset.splits[split])

        if torch.cuda.device_count() > 0:
            model = model.cuda()
            cuda_device = torch.device(0)
        else:
            cuda_device = torch.device("cpu")

        generator_tqdm = Tqdm.tqdm(iter(concrete_data_loader))

        # Per-instance model outputs.
        predictions: List[Dict[str, Any]] = []
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Number of batches seen.
        batch_count = 0
        # Cumulative loss
        total_loss = 0.0

        with torch.inference_mode():
            model.eval()

            # Iterate through the tqdm wrapper (not the raw loader) so the
            # progress bar actually advances and shows the description below.
            for batch in generator_tqdm:
                batch_count += 1
                batch = move_to_device(batch, cuda_device)
                output_dict = model(**batch)

                metrics = model.get_metrics()

                loss = output_dict.pop("loss", None)
                if loss is not None:
                    loss_count += 1
                    total_loss += loss.item()
                    metrics["loss"] = total_loss / loss_count

                    if any(
                            metric_name.startswith("_")
                            for metric_name in metrics):
                        self.logger.warning_once(
                            'Metrics with names beginning with "_" will '
                            "not be logged to the tqdm progress bar.")

                    description = (", ".join([
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]) + " ||")
                    generator_tqdm.set_description(description, refresh=False)

                output_dict = sanitize(output_dict)

                # Transpose the dict of per-batch lists into one dict per
                # instance: {"a": [1, 2], "b": [3, 4]} becomes
                # [{"a": 1, "b": 3}, {"a": 2, "b": 4}].
                predictions.extend(
                    dict(zip(output_dict.keys(), x))
                    for x in zip(*output_dict.values()))

            final_metrics = model.get_metrics(reset=True)

        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / loss_count

        return self.EvaluationResult(final_metrics, predictions)
Exemple #5
0
def _from_params(
        cls,  # type: ignore
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_data: Optional[Iterable[Instance]],
        params: Params,
        validation_iterator: DataIterator = None) -> DecompTrainer:
    """Build a trainer instance from a ``Params`` configuration.

    All recognised keys are popped from ``params``; ``params.assert_empty`` is
    called at the end, so unknown keys are reported by that call.

    BERT handling:

    * If ``bert_optimizer`` is configured, the BERT embeddings, pooler and
      encoder layers with index below ``bert_tune_layer`` (default 12) are
      frozen; the remaining encoder layers (indices up to 11) are trained by a
      dedicated optimizer. If no layer ends up tunable, no BERT optimizer is
      created (``bert_optimizer`` is ``None``).
    * Otherwise every parameter whose name contains ``_bert_encoder`` is
      frozen and ``bert_optimizer`` is ``None``.

    All other trainable parameters go to the main ``optimizer``.

    Raises:
        ConfigurationError: If both ``checkpointer`` and the legacy
            checkpointing keys are present in ``params``.
    """
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)

    num_epochs = params.pop_int("num_epochs", 20)

    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    validation_data_path = params.pop("validation_data_path", None)
    validation_prediction_path = params.pop("validation_prediction_path", None)

    semantics_only = params.pop("semantics_only", False)
    drop_syntax = params.pop("drop_syntax", True)
    include_attribute_scores = params.pop("include_attribute_scores", False)

    warmup_epochs = params.pop("warmup_epochs", 0)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    bert_optim_params = params.pop("bert_optimizer", None)
    bert_name = "_bert_encoder"

    if bert_optim_params is not None:
        tune_after_layer_num = params.pop("bert_tune_layer", 12)

        # Raw strings: "\." in a normal string literal is an invalid escape
        # sequence (a warning today, slated to become an error).
        frozen_regex_str = [
            r"(_bert_encoder\.bert_model\.embeddings.*)",
            r"(_bert_encoder\.bert_model\.pooler.*)"
        ]
        tune_regex_str = []
        for i in range(0, 12):
            # one alternative per layer; layers >= tune_after_layer_num are tuned
            tune_regex_one = rf"({bert_name}\.bert_model\.encoder\.layer\.{i}\..*)"
            if i >= tune_after_layer_num:
                tune_regex_str.append(tune_regex_one)
            else:
                frozen_regex_str.append(tune_regex_one)
        frozen_regex = re.compile("|".join(frozen_regex_str))
        # decide which params require grad for which optimizer
        all_names = [n for n, p in model.named_parameters()]
        if tune_regex_str:
            tune_regex = re.compile("|".join(tune_regex_str))
            tune_bert_names = [
                n for n in all_names if tune_regex.match(n) is not None
            ]
        else:
            # An empty alternation compiles to a pattern that matches every
            # name, so treat "no tunable layers" explicitly.
            tune_bert_names = []
        frozen_names = {
            n for n in all_names if frozen_regex.match(n) is not None
        }
        # assert that they're disjoint
        assert frozen_names.isdisjoint(tune_bert_names)
        # set tunable params to require gradient, frozen ones to not require
        for n, p in model.named_parameters():
            p.requires_grad = n not in frozen_names

        # extract BERT
        tune_bert_name_set = set(tune_bert_names)
        bert_params = [[n, p] for n, p in model.named_parameters()
                       if p.requires_grad and n in tune_bert_name_set]
        # make sure this matches the tuneable bert params
        assert ([x[0] for x in bert_params] == tune_bert_names)
        if bert_params:
            bert_optimizer = Optimizer.from_params(bert_params,
                                                   bert_optim_params)
        else:
            # Nothing to tune: behave like the fully-frozen configuration.
            bert_optimizer = None
    else:
        # freeze all BERT params
        tune_bert_names = []
        bert_optimizer = None
        for n, p in model.named_parameters():
            if "_bert_encoder" in n:
                p.requires_grad = False

    # model params: everything trainable that is not owned by the BERT optimizer
    bert_owned = set(tune_bert_names)
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad and n not in bert_owned]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(
            optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=
            keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                               False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    syntactic_method = params.pop("syntactic_method", None)
    accumulate_batches = params.pop("accumulate_batches", 1)

    params.assert_empty(cls.__name__)
    return cls(model=model,
               optimizer=optimizer,
               bert_optimizer=bert_optimizer,
               iterator=iterator,
               train_dataset=train_data,
               validation_dataset=validation_data,
               validation_data_path=validation_data_path,
               validation_prediction_path=validation_prediction_path,
               semantics_only=semantics_only,
               warmup_epochs=warmup_epochs,
               syntactic_method=syntactic_method,
               drop_syntax=drop_syntax,
               include_attribute_scores=include_attribute_scores,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               accumulate_batches=accumulate_batches)
def train_model(args,
                model: Model,
                train_dataset,
                valid_dataset,
                test_dataset=None,
                metric='fscore'):
    """Train ``model``, dump its metrics and log validation/test summaries.

    Args:
        args: Options object; this function reads ``model_path``, ``batch``,
            ``device``, ``patience``, ``epoch``, ``lr_reduce_patience``,
            ``lr_reduce_factor`` and ``fp16`` from it.
        model: Model to train; its ``vocab`` is saved under
            ``<model_path>/vocab``.
        train_dataset: Training instances.
        valid_dataset: Validation instances.
        test_dataset: Optional test instances; evaluated after training when
            truthy.
        metric: Name of the validation metric to maximise (prefixed with
            ``'+'`` for the trainer).

    Side effects: writes ``vocab``, ``model.option`` and ``metrics.json``
    under ``args.model_path`` and logs summary lines. Returns ``None``.
    """
    output_model_path = args.model_path

    iterator = BucketIterator(sorting_keys=[('text', 'num_tokens')],
                              batch_size=args.batch)
    iterator.index_with(model.vocab)
    model.vocab.save_to_files(os.path.join(output_model_path, 'vocab'))
    save_model_options(file_path=os.path.join(output_model_path,
                                              'model.option'),
                       options=args)

    # Move the model to its device *before* preparing the optimizer so the
    # optimizer is built over parameters that already live on that device.
    if torch.cuda.is_available():
        cuda_device = args.device

        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = env_utils.prepare_optimizer(args, model)

    logger.info(model)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=args.patience,
        num_epochs=args.epoch,
        cuda_device=cuda_device,
        serialization_dir=output_model_path,
        num_serialized_models_to_keep=1,
        validation_metric='+' + metric,
        learning_rate_scheduler=LearningRateScheduler.from_params(
            optimizer,
            Params(
                {
                    'type': 'reduce_on_plateau',
                    'patience': args.lr_reduce_patience,
                    'verbose': True,
                    'factor': args.lr_reduce_factor,
                    'mode': 'max'
                }, )),
        automatic_mixed_precision=args.fp16)

    train_result = trainer.train()
    dump_metrics(os.path.join(output_model_path, 'metrics.json'),
                 train_result)

    valid_result = {
        'loss': train_result['best_validation_loss'],
        'precision': train_result['best_validation_precision'],
        'recall': train_result['best_validation_recall'],
        'fscore': train_result['best_validation_fscore'],
        'accuracy': train_result['best_validation_accuracy'],
    }

    # Argument order must follow the labels in the template: Loss, Acc, P, R, F1.
    result_str = "Final Valid Loss: %.4f, Acc: %.2f, P: %.2f, R: %.2f, F1: %.2f" % (
        valid_result['loss'], valid_result['accuracy'],
        valid_result['precision'], valid_result['recall'],
        valid_result['fscore'])

    logger.info(result_str)

    if test_dataset:
        test_result = evaluate(model,
                               test_dataset,
                               iterator,
                               cuda_device=cuda_device,
                               batch_weight_key="")
        result_str = "Final Test  Loss: %.4f, Acc: %.2f, P: %.2f, R: %.2f, F1: %.2f" % (
            test_result['loss'], test_result['accuracy'],
            test_result['precision'], test_result['recall'],
            test_result['fscore'])
        logger.info(result_str)

    logger.info("Model Path: %s", output_model_path)