Example #1
File: train.py Project: Wkryst/FARM
 def log_params(self):
     params = {
         "epochs": self.epochs,
         "n_gpu": self.n_gpu,
         "device": self.device
     }
     MlLogger.log_params(params)
Example #2
 def log_params(self):
     """
     Logs parameters to generic logger MlLogger
     """
     params = {
         "lm1_type": self.language_model1.__class__.__name__,
         "lm1_name": self.language_model1.name,
         "lm1_output_types": ",".join(self.lm1_output_types),
         "lm2_type": self.language_model2.__class__.__name__,
         "lm2_name": self.language_model2.name,
         "lm2_output_types": ",".join(self.lm2_output_types),
         "prediction_heads": ",".join(
             [head.__class__.__name__ for head in self.prediction_heads]
         ),
     }
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #3
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        train_input_numpy = self.data["train"][:][0].numpy()
        seq_lens = np.sum(train_input_numpy != 0, axis=1)
        self.ave_len = np.mean(seq_lens)
        max_seq_len = self.data["train"][:][0].shape[1]
        self.clipped = np.mean(seq_lens == max_seq_len)

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        logger.info("Max sequence length:     {}".format(max(seq_lens)))
        logger.info("Average sequence length: {}".format(self.ave_len))
        logger.info("Proportion clipped:      {}".format(self.clipped))
        if self.clipped > 0.5:
            logger.info(
                "[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                "Consider increasing max_seq_len. "
                "This will lead to higher memory consumption but is likely to "
                "improve your model performance".format(
                    round(self.clipped * 100, 1), max_seq_len))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["dev"],
            "n_samples_test": self.counts["test"],
            "ave_seq_len": self.ave_len,
            "clipped": self.clipped
        })
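The statistics above boil down to two numpy reductions over the padded input IDs. A minimal sketch on a toy batch (the array values are made up for illustration):

import numpy as np

# Toy batch of input IDs, zero-padded to max_seq_len = 5 (illustrative values only)
input_ids = np.array([
    [101, 2009, 2003, 102, 0],      # 4 real tokens, 1 padding token
    [101, 7592, 2088, 2003, 102],   # fills max_seq_len -> counts as "clipped"
])

seq_lens = np.sum(input_ids != 0, axis=1)    # array([4, 5])
max_seq_len = input_ids.shape[1]             # 5
ave_len = np.mean(seq_lens)                  # 4.5
clipped = np.mean(seq_lens == max_seq_len)   # 0.5 -> half of the samples hit max_seq_len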
Example #4
def eval_question_similarity(y_true,
                             y_pred,
                             lang,
                             model_name,
                             params,
                             user=None,
                             log_to_mlflow=True,
                             run_name="default"):
    # basic metrics
    mean_diff = np.mean(np.abs(y_true - y_pred))
    roc_auc = roc_auc_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred.round(0))
    metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1}
    print(metrics)

    # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/)
    if log_to_mlflow:
        params["lang"] = lang
        params["model_name"] = model_name
        if user:
            params["user"] = user

        ml_logger = MLFlowLogger(
            tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="COVID-question-sim",
                                  run_name=run_name)
        ml_logger.log_params(params)
        ml_logger.log_metrics(metrics, step=0)
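A minimal call sketch for the function above, assuming y_true holds binary labels and y_pred holds predicted similarity scores in [0, 1]; the arrays and parameter values are made up for illustration, and log_to_mlflow=False keeps the dry run local (the function prints the metrics dict):

import numpy as np

y_true = np.array([1, 0, 1, 1, 0])            # illustrative gold labels
y_pred = np.array([0.9, 0.2, 0.7, 0.6, 0.4])  # illustrative similarity scores

eval_question_similarity(
    y_true,
    y_pred,
    lang="en",
    model_name="my-sentence-encoder",   # hypothetical model name
    params={"batch_size": 32},          # any settings you want tracked alongside the metrics
    log_to_mlflow=False,                # skip the public MLflow server for a local dry run
)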
Example #5
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        train_input_numpy = self.data["train"][:][0].numpy()
        seq_lens = np.sum(train_input_numpy != 0, axis=1)
        self.ave_len = np.mean(seq_lens)
        max_seq_len = self.data["train"][:][0].shape[1]
        self.clipped = np.mean(seq_lens == max_seq_len)

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        logger.info("Max sequence length:     {}".format(max(seq_lens)))
        logger.info("Average sequence length: {}".format(self.ave_len))
        logger.info("Proportion clipped:      {}".format(self.clipped))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["dev"],
            "n_samples_test": self.counts["test"],
            "ave_seq_len": self.ave_len,
            "clipped": self.clipped
        })
Example #6
def _get_optim(model, opts):
    """ Get the optimizer based on dictionary with options. Options are passed to the optimizer constructor.

    :param model: model to optimize
    :param opts: config dictionary that will be passed to the optimizer together with the params
    (e.g. lr, weight_decay, correct_bias ...). 'no_decay' can be given - parameters whose names contain
    any of those strings will have weight_decay set to 0.
    :return: created optimizer
    """

    optimizer_name = opts.pop('name', None)

    # Logging
    logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
    MlLogger.log_params(opts)
    MlLogger.log_params({"optimizer_name": optimizer_name})

    weight_decay = opts.pop('weight_decay', None)
    no_decay = opts.pop('no_decay', None)

    if no_decay:
        optimizable_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
             **opts},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
             'weight_decay': 0.0,
             **opts}
        ]
    else:
        optimizable_parameters = [{'params': [p for p in model.parameters() if p.requires_grad], **opts}]

    # default weight decay is not the same for all optimizers, so we can't use default value
    # only explicitly add weight decay if it's given
    if weight_decay is not None:
        optimizable_parameters[0]['weight_decay'] = weight_decay

    # Import optimizer by checking in order: torch, transformers, apex and local imports
    try:
        optim_constructor = getattr(import_module('torch.optim'), optimizer_name)
    except AttributeError:
        try:
            optim_constructor = getattr(import_module('transformers.optimization'), optimizer_name)
        except AttributeError:
            try:
                optim_constructor = getattr(import_module('apex.optimizers'), optimizer_name)
            except (AttributeError, ImportError):
                try:
                    # Workaround to allow loading AdamW from transformers
                    # pytorch > 1.2 has now also a AdamW (but without the option to set bias_correction = False,
                    # which is done in the original BERT implementation)
                    optim_constructor = getattr(sys.modules[__name__], optimizer_name)
                except (AttributeError, ImportError):
                    raise AttributeError(f"Optimizer '{optimizer_name}' not found in 'torch', 'transformers', 'apex' or 'local imports")

    return optim_constructor(optimizable_parameters)
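A usage sketch under stated assumptions: a plain torch module as a stand-in model, and an AdamW configuration that _get_optim resolves from torch.optim (available in torch >= 1.2). The hyperparameter values are illustrative, not recommendations:

import torch

model = torch.nn.Linear(768, 2)  # stand-in model, for illustration only

opts = {
    "name": "AdamW",                           # looked up in torch.optim first
    "lr": 2e-5,
    "weight_decay": 0.01,                      # applied to the first parameter group
    "no_decay": ["bias", "LayerNorm.weight"],  # matching parameters get weight_decay = 0.0
}
optimizer = _get_optim(model, opts)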
Example #7
    def _calculate_statistics(self):
        """ Calculate and log simple summary statistics of the datasets """

        self.counts = {}

        if self.data["train"]:
            self.counts["train"] = len(self.data["train"])
        else:
            self.counts["train"] = 0

        if self.data["dev"]:
            self.counts["dev"] = len(self.data["dev"])
        else:
            self.counts["dev"] = 0

        if self.data["test"]:
            self.counts["test"] = len(self.data["test"])
        else:
            self.counts["test"] = 0

        seq_lens = []
        if self.data["train"]:
            for dataset in self.data["train"].datasets:
                train_input_numpy = dataset[:][0].numpy()
                seq_lens.extend(np.sum(train_input_numpy != self.processor.tokenizer.pad_token_id, axis=1))
            max_seq_len = dataset[:][0].shape[1]

        self.clipped = np.mean(np.array(seq_lens) == max_seq_len) if seq_lens else 0
        self.ave_len = np.mean(seq_lens) if seq_lens else 0

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        if self.data["train"]:
            logger.info("Longest sequence length observed after clipping:     {}".format(max(seq_lens)))
            logger.info("Average sequence length after clipping: {}".format(self.ave_len))
            logger.info("Proportion clipped:      {}".format(self.clipped))
            if self.clipped > 0.5:
                logger.info("[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                            "Consider increasing max_seq_len. "
                            "This will lead to higher memory consumption but is likely to "
                            "improve your model performance".format(round(self.clipped * 100, 1), max_seq_len))

        MlLogger.log_params(
            {
                "n_samples_train": self.counts["train"],
                "n_samples_dev": self.counts["dev"],
                "n_samples_test": self.counts["test"],
                "batch_size": self.batch_size,
                "ave_seq_len": self.ave_len,
                "clipped": self.clipped,
            }
        )
Example #8
 def _log_params(self):
     params = {
         "processor": self.__class__.__name__,
         "tokenizer": self.tokenizer.__class__.__name__,
     }
     names = ["max_seq_len", "dev_split"]
     for name in names:
         value = getattr(self, name)
         params.update({name: str(value)})
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #9
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["dev"],
            "n_samples_test": self.counts["test"],
        })
Example #10
 def log_params(self):
     """
     Logs parameters to generic logger MlLogger
     :return: just log into the void
     """
     params = {
         "lm": self.language_model.__class__.__name__,
         "prediction_heads": ",".join(
             [head.__class__.__name__ for head in self.prediction_heads]
         ),
         "lm_output_types": ",".join(self.lm_output_types),
     }
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #11
def get_scheduler(optimizer, opts):
    """ Get the scheduler based on dictionary with options. Options are passed to the scheduler constructor.

    :param optimizer: optimizer whose learning rate to control
    :param opts: dictionary of args to be passed to constructor of schedule
    :return: created scheduler
    """
    schedule_name = opts.get('name')
    try:
        sched_constructor = getattr(import_module('torch.optim.lr_scheduler'),
                                    schedule_name)
    except AttributeError:
        try:
            # The method names in transformers became quite long and unwieldy,
            # so for convenience we offer shorter aliases (e.g. "LinearWarmup").
            scheduler_translations = {
                "LinearWarmup": "get_linear_schedule_with_warmup",
                "ConstantWarmup": "get_constant_schedule_with_warmup",
                "Constant": "get_constant_schedule",
                "CosineWarmup": "get_cosine_schedule_with_warmup",
                "CosineWarmupWithRestarts": "get_cosine_with_hard_restarts_schedule_with_warmup",
            }
            if schedule_name in scheduler_translations.keys():
                schedule_name = scheduler_translations[schedule_name]
            # in contrast to torch, we actually get here a method and not a class
            sched_constructor = getattr(
                import_module('transformers.optimization'), schedule_name)
        except AttributeError:
            raise AttributeError(
                f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'"
            )

    logger.info(f"Using scheduler '{schedule_name}'")

    # get supported args of constructor
    allowed_args = inspect.signature(sched_constructor).parameters.keys()

    # convert from warmup proportion to steps if required
    if 'num_warmup_steps' in allowed_args and 'num_warmup_steps' not in opts and 'warmup_proportion' in opts:
        opts['num_warmup_steps'] = int(opts["warmup_proportion"] *
                                       opts["num_training_steps"])
        MlLogger.log_params({"warmup_proportion": opts["warmup_proportion"]})

    # only pass args that are supported by the constructor
    constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}

    # Logging
    logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
    MlLogger.log_params(constructor_opts)
    MlLogger.log_params({"schedule_name": schedule_name})

    scheduler = sched_constructor(optimizer, **constructor_opts)
    scheduler.opts = opts  # save the opts with the scheduler to use in load/save
    return scheduler
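A call sketch, assuming `optimizer` was created beforehand (e.g. with _get_optim above) and using the "LinearWarmup" alias, which get_scheduler maps to transformers' get_linear_schedule_with_warmup; the step counts are illustrative:

schedule_opts = {
    "name": "LinearWarmup",
    "num_training_steps": 10000,
    "warmup_proportion": 0.1,   # converted to num_warmup_steps = 1000 inside get_scheduler
}
scheduler = get_scheduler(optimizer, schedule_opts)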
Example #12
def initialize_optimizer(model,
                         n_batches,
                         n_epochs,
                         device,
                         learning_rate,
                         optimizer_opts=None,
                         schedule_opts=None,
                         distributed=False,
                         grad_acc_steps=1,
                         local_rank=-1,
                         use_amp=None):
    """
    Initializes an optimizer and a learning rate scheduler, and converts the model if needed (e.g. for mixed precision).
    By default, we use transformers' AdamW and a linear warmup schedule with a warmup ratio of 0.1.
    You can easily switch optimizer and schedule via `optimizer_opts` and `schedule_opts`.

    :param model: model to optimize (it may be converted, e.g. weights cast to fp16 for mixed precision)
    :type model: AdaptiveModel
    :param n_batches: number of batches for training
    :type n_batches: int
    :param n_epochs: number of epochs for training
    :param device:
    :param learning_rate: Learning rate
    :type learning_rate: float
    :param optimizer_opts: Dict to customize the optimizer. Choose any optimizer available from torch.optim, apex.optimizers or
                           transformers.optimization by supplying the class name and the parameters for the constructor.
                           Examples:
                           1) AdamW from Transformers (Default):
                           {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
                           2) SGD from pytorch:
                           {"name": "SGD", "momentum": 0.0}
                           3) FusedLAMB from apex:
                           {"name": "FusedLAMB", "bias_correction": True}
    :param schedule_opts: Dict to customize the learning rate schedule.
                          Choose any Schedule from Pytorch or Huggingface's Transformers by supplying the class name
                          and the parameters needed by the constructor.
                          If the dict does not contain ``num_training_steps`` it will be set by
                          calculating it from ``n_batches``, ``grad_acc_steps`` and ``n_epochs``.
                          Examples:
                          1) Linear Warmup (Default):
                          {"name": "LinearWarmup",
                          "num_warmup_steps": 0.1 * num_training_steps,
                          "num_training_steps": num_training_steps}
                          2) CosineWarmup:
                          {"name": "CosineWarmup",
                          "num_warmup_steps": 0.1 * num_training_steps,
                          "num_training_steps": num_training_steps}
                          3) CyclicLR from pytorch:
                          {"name": "CyclicLR", "base_lr": 1e-5, "max_lr":1e-4, "step_size_up": 100}
    :param distributed: Whether training on distributed machines
    :param grad_acc_steps: Number of steps to accumulate gradients for. Helpful to mimic large batch_sizes on small machines.
    :param local_rank: rank of the machine in a distributed setting
    :param use_amp: Optimization level of nvidia's automatic mixed precision (AMP). The higher the level, the faster the model.
                    Options:
                    "O0" (Normal FP32 training)
                    "O1" (Mixed Precision => Recommended)
                    "O2" (Almost FP16)
                    "O3" (Pure FP16).
                    See details on: https://nvidia.github.io/apex/amp.html
    :return: model, optimizer, scheduler
    """

    if use_amp and not AMP_AVAILABLE:
        raise ImportError(
            f'Got use_amp = {use_amp}, but cannot find apex. '
            'Please install Apex if you want to make use of automatic mixed precision. '
            'https://github.com/NVIDIA/apex')

    if (schedule_opts is not None) and (not isinstance(schedule_opts, dict)):
        raise TypeError('Parameter schedule_opts must be None or '
                        'an instance of dict but was {}!'.format(
                            type(schedule_opts)))

    num_train_optimization_steps = int(n_batches / grad_acc_steps) * n_epochs

    # Use some defaults to simplify life of inexperienced users
    if optimizer_opts is None:
        optimizer_opts = {
            "name": "TransformersAdamW",
            "correct_bias": False,
            "weight_decay": 0.01
        }
    optimizer_opts["lr"] = learning_rate

    if schedule_opts is None:
        # Default schedule: Linear Warmup with 10% warmup
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": 0.1 * num_train_optimization_steps,
            "num_training_steps": num_train_optimization_steps
        }

        # schedule_opts = {"name": "OneCycleLR", "max_lr":learning_rate, "pct_start": 0.1,
        #                  "total_steps": num_train_optimization_steps }
    elif "num_training_steps" not in schedule_opts:
        schedule_opts["num_training_steps"] = num_train_optimization_steps

    # Log params
    MlLogger.log_params({
        "use_amp": use_amp,
        "num_train_optimization_steps": schedule_opts["num_training_steps"],
    })

    # Get optimizer from pytorch, transformers or apex
    optimizer = _get_optim(model, optimizer_opts)

    # Adjust for parallel training + amp
    model, optimizer = optimize_model(model, device, local_rank, optimizer,
                                      distributed, use_amp)

    # Get learning rate schedule - moved below to suppress warning
    scheduler = get_scheduler(optimizer, schedule_opts)

    return model, optimizer, scheduler
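A call sketch using the optimizer and schedule dicts described in the docstring above; `model` is assumed to be an existing AdaptiveModel-style module, and the numeric values are placeholders (n_batches would normally be the length of your training data loader):

model, optimizer, scheduler = initialize_optimizer(
    model=model,
    n_batches=500,            # placeholder; typically len(train_data_loader)
    n_epochs=2,
    device="cuda",
    learning_rate=3e-5,
    optimizer_opts={"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01},
    schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.1},  # num_training_steps is filled in automatically
)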
Example #13
def initialize_optimizer(
    model,
    n_examples,
    batch_size,
    n_epochs,
    warmup_proportion=0.1,
    learning_rate=2e-5,
    fp16=False,
    loss_scale=0,
    grad_acc_steps=1,
    local_rank=-1,
):
    num_train_optimization_steps = calculate_optimization_steps(
        n_examples, batch_size, grad_acc_steps, n_epochs, local_rank)

    # Log params
    MlLogger.log_params({
        "learning_rate": learning_rate,
        "warmup_proportion": warmup_proportion,
        "fp16": fp16,
        "num_train_optimization_steps": num_train_optimization_steps,
    })
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=warmup_proportion, t_total=num_train_optimization_steps)
        return optimizer, warmup_linear

    else:
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=learning_rate,
            warmup=warmup_proportion,
            t_total=num_train_optimization_steps,
        )
        return optimizer, None
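For this older interface, a hedged call sketch; `model` is assumed to exist and the argument values are placeholders. With fp16 left at its default of False, the function returns (optimizer, None):

optimizer, warmup_linear = initialize_optimizer(
    model=model,        # any model exposing named_parameters()
    n_examples=10000,   # placeholder dataset size
    batch_size=32,
    n_epochs=2,
    learning_rate=2e-5,
)
# warmup_linear is None here because fp16 defaults to False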