Example no. 1
    def test_model_for_pretraining_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForPreTraining.from_pretrained(model_name)
            model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForPreTraining)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)
Example no. 2
    def test_model_for_pretraining_from_pretrained(self):
        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForPreTraining.from_pretrained(model_name)
            model, loading_info = AutoModelForPreTraining.from_pretrained(
                model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForPreTraining)
            for key, value in loading_info.items():
                # Only one value should not be initialized and in the missing keys.
                self.assertEqual(len(value), 1 if key == "missing_keys" else 0)
Example no. 3
    def test_model_for_pretraining_from_pretrained(self):
        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForPreTraining.from_pretrained(model_name)
            model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForPreTraining)
            # Only one value should not be initialized and in the missing keys.
            missing_keys = loading_info.pop("missing_keys")
            self.assertListEqual(["cls.predictions.decoder.bias"], missing_keys)
            for key, value in loading_info.items():
                self.assertEqual(len(value), 0)
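
The three test variants in Examples 1-3 assume roughly the following imports; this is a hedged sketch, since the enclosing unittest.TestCase class and the exact BERT archive constant differ between transformers releases and are not shown in the snippets:

import logging
import unittest

from transformers import (
    AutoConfig,
    AutoModelForPreTraining,
    BertConfig,
    BertForPreTraining,
)
# Older releases expose BERT_PRETRAINED_MODEL_ARCHIVE_MAP (a dict), newer ones
# BERT_PRETRAINED_MODEL_ARCHIVE_LIST (a list); the tests above use one or the other.
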
Example no. 4
def word_preds(datadir: str, ff_size: int):
    log.configure(os.path.join(datadir, "dabert-word-preds.log"), "daBERT word predictions")
    log("Loading metadata")
    with open(os.path.join(datadir, DatasetBuilder.metadata_file)) as f:
        metadata = json.load(f)
    log("Loading model")
    dabert = AutoModelForPreTraining.from_pretrained(daBERT).to(device)
    log("Loading data")
    dataloader = DataLoader(
        datadir,
        metadata,
        dict(),
        device,
    )
    loader = dataloader.get_dataloader(ff_size, None)
    log("Forward passing")
    correct_preds = np.zeros(len(loader))
    for i, batch in tqdm(enumerate(loader), total=len(loader)):
        logits = dabert(batch.words.ids).prediction_logits
        masked_logits = logits[batch.word_mask]
        preds = masked_logits.argmax(dim=1)
        correct_preds[i] = (preds == batch.word_mask_labels).float().mean().cpu()
    log(
        "MLM token prediction accuracy",
        "  Mean: %.4f %%" % (100 * correct_preds.mean()),
        "  Std.: %.4f %%" % (100 * correct_preds.std(ddof=1)),
    )
Example no. 5
def export_model(
    hf_pretrained_model_name_or_path: str,
    output_base_path: str,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data
    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above
    Args:
        hf_pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                        Can be either:

                            - A string, the `model id` of a pretrained model configuration
                              hosted inside a model repo on huggingface.co.
                              Valid model ids can be located at the root-level, like
                              ``bert-base-uncased``, or namespaced under a user
                              or organization name, like ``dbmdz/bert-base-german-cased``.
                            - A path to a `directory` containing a configuration file saved using
                              the :meth:`~transformers.PretrainedConfig.save_pretrained` method,
                              or the
                              :meth:`~transformers.PreTrainedModel.save_pretrained` method,
                              e.g., ``./my_model_directory/``.
                            - A path or url to a saved configuration JSON `file`, e.g.,
                              ``./my_model_directory/configuration.json``.
        output_base_path: Base path to save output to
    """
    model = AutoModelForPreTraining.from_pretrained(
        hf_pretrained_model_name_or_path)
    model_type = model.config_class.model_type

    model_fol_path = os.path.join(output_base_path, "model")
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")

    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)

    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = AutoTokenizer.from_pretrained(hf_pretrained_model_name_or_path)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, "config.json"))
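
A hedged usage sketch for export_model; the model id and output directory below are placeholders, and the listed layout is inferred from the function body (the module is assumed to already import os, torch, py_io, and the Auto classes used above):

if __name__ == "__main__":
    export_model(
        hf_pretrained_model_name_or_path="bert-base-uncased",
        output_base_path="./exported/bert-base-uncased",
    )
    # Resulting layout, inferred from export_model:
    #   model/<model_type>.p      torch state dict
    #   model/<model_type>.json   model config as JSON
    #   tokenizer/                tokenizer files
    #   config.json               paths to the artifacts above
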
Example no. 6
    def test_model_for_pretraining_from_pretrained(self):
        # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelForPreTraining.from_pretrained(model_name,
                                                              from_pt=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForPreTraining)

            model = AutoModelForPreTraining.from_pretrained(model_name,
                                                            from_tf=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForPreTraining)
Example no. 7
 def __init__(self,
              model_name,
              num_epochs,
              batch_size,
              sentence_length,
              device,
              models_dir,
              patience=5):
     self.model_name = model_name
     # self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
     self.model = AutoModelForPreTraining.from_pretrained(model_name)
     self.num_epochs = num_epochs
     self.batch_size = batch_size
     self.device = device
     self.models_dir = models_dir
     self.patience = patience
     # self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
     self.tokenizer = AutoTokenizer.from_pretrained(model_name)
Example no. 8
from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForPreTraining,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)
from transformers_interpret import BaseExplainer

DISTILBERT_MODEL = AutoModelForMaskedLM.from_pretrained(
    "distilbert-base-uncased")
DISTILBERT_TOKENIZER = AutoTokenizer.from_pretrained("distilbert-base-uncased")

GPT2_MODEL = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
GPT2_TOKENIZER = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")

BERT_MODEL = AutoModelForPreTraining.from_pretrained(
    "lysandre/tiny-bert-random")
BERT_TOKENIZER = AutoTokenizer.from_pretrained("lysandre/tiny-bert-random")


class DummyExplainer(BaseExplainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def encode(self, text: str = None):
        return self.tokenizer.encode(text, add_special_tokens=False)

    def decode(self, input_ids):
        return self.tokenizer.convert_ids_to_tokens(input_ids[0])

    @property
    def word_attributions(self):
Example no. 9
 def get_model(self):
     from transformers import AutoModelForPreTraining
     _model = AutoModelForPreTraining.from_pretrained(
         self.pretrained_model_name_or_path)
     self.to_device(_model)
     return _model
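
For completeness, a minimal, hypothetical wrapper class that a get_model method like the one in Example 9 could belong to; only pretrained_model_name_or_path and to_device are taken from the snippet, everything else is assumed:

import torch

class PretrainedModelProvider:
    """Hypothetical host class for the get_model pattern shown above."""

    def __init__(self, pretrained_model_name_or_path: str):
        self.pretrained_model_name_or_path = pretrained_model_name_or_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def to_device(self, model):
        model.to(self.device)

    def get_model(self):
        from transformers import AutoModelForPreTraining
        _model = AutoModelForPreTraining.from_pretrained(
            self.pretrained_model_name_or_path)
        self.to_device(_model)
        return _model
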
Example no. 10
def run_pretraining(args):
    if args.parallel and args.local_rank == -1:
        run_parallel_pretraining(args)
        return

    if args.local_rank == -1:
        if args.cpu:
            device = torch.device("cpu")
        else:
            device = torch.device("cuda")
        num_workers = 1
        worker_index = 0
    else:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", args.local_rank)
        num_workers = torch.distributed.get_world_size()
        worker_index = torch.distributed.get_rank()

    if args.local_rank not in (-1, 0):
        logging.getLogger().setLevel(logging.WARN)

    logger.info(
        "Starting pretraining with the following arguments: %s", json.dumps(vars(args), indent=2, sort_keys=True)
    )

    if args.multilingual:
        dataset_dir_list = args.dataset_dir.split(",")
        dataset_list = [WikipediaPretrainingDataset(d) for d in dataset_dir_list]
    else:
        dataset_list = [WikipediaPretrainingDataset(args.dataset_dir)]

    bert_config = AutoConfig.from_pretrained(args.bert_model_name)

    dataset_size = sum([len(d) for d in dataset_list])
    num_train_steps_per_epoch = math.ceil(dataset_size / args.batch_size)
    num_train_steps = math.ceil(dataset_size / args.batch_size * args.num_epochs)
    train_batch_size = int(args.batch_size / args.gradient_accumulation_steps / num_workers)

    entity_vocab = dataset_list[0].entity_vocab
    config = LukeConfig(
        entity_vocab_size=entity_vocab.size,
        bert_model_name=args.bert_model_name,
        entity_emb_size=args.entity_emb_size,
        **bert_config.to_dict(),
    )
    model = LukePretrainingModel(config)

    global_step = args.global_step

    batch_generator_args = dict(
        batch_size=train_batch_size,
        masked_lm_prob=args.masked_lm_prob,
        masked_entity_prob=args.masked_entity_prob,
        whole_word_masking=args.whole_word_masking,
        unmasked_word_prob=args.unmasked_word_prob,
        random_word_prob=args.random_word_prob,
        unmasked_entity_prob=args.unmasked_entity_prob,
        random_entity_prob=args.random_entity_prob,
        mask_words_in_entity_span=args.mask_words_in_entity_span,
        num_workers=num_workers,
        worker_index=worker_index,
        skip=global_step * args.batch_size,
    )

    if args.multilingual:
        data_size_list = [len(d) for d in dataset_list]
        batch_generator = MultilingualBatchGenerator(
            dataset_dir_list, data_size_list, args.sampling_smoothing, **batch_generator_args,
        )

    else:
        batch_generator = LukePretrainingBatchGenerator(args.dataset_dir, **batch_generator_args)

    logger.info("Model configuration: %s", config)

    if args.fix_bert_weights:
        for param in model.parameters():
            param.requires_grad = False
        for param in model.entity_embeddings.parameters():
            param.requires_grad = True
        for param in model.entity_predictions.parameters():
            param.requires_grad = True

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = LukeAdamW(
        optimizer_parameters,
        lr=args.learning_rate,
        betas=(args.adam_b1, args.adam_b2),
        eps=args.adam_eps,
        grad_avg_device=torch.device("cpu") if args.grad_avg_on_cpu else device,
    )

    if args.fp16:
        from apex import amp

        if args.fp16_opt_level == "O2":
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.fp16_opt_level,
                master_weights=args.fp16_master_weights,
                min_loss_scale=args.fp16_min_loss_scale,
                max_loss_scale=args.fp16_max_loss_scale,
            )
        else:
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.fp16_opt_level,
                min_loss_scale=args.fp16_min_loss_scale,
                max_loss_scale=args.fp16_max_loss_scale,
            )

    if args.model_file is None:
        bert_model = AutoModelForPreTraining.from_pretrained(args.bert_model_name)
        bert_state_dict = bert_model.state_dict()
        model.load_bert_weights(bert_state_dict)

    else:
        model_state_dict = torch.load(args.model_file, map_location="cpu")
        model.load_state_dict(model_state_dict, strict=False)

    if args.optimizer_file is not None:
        optimizer.load_state_dict(torch.load(args.optimizer_file, map_location="cpu"))

    if args.amp_file is not None:
        amp.load_state_dict(torch.load(args.amp_file, map_location="cpu"))

    if args.lr_schedule == "warmup_constant":
        scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps)
    elif args.lr_schedule == "warmup_linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_train_steps
        )
    else:
        raise RuntimeError(f"Invalid scheduler: {args.lr_schedule}")

    if args.scheduler_file is not None:
        scheduler.load_state_dict(torch.load(args.scheduler_file, map_location="cpu"))

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            broadcast_buffers=False,
            find_unused_parameters=True,
        )

    model.train()

    if args.local_rank == -1 or worker_index == 0:
        entity_vocab.save(os.path.join(args.output_dir, ENTITY_VOCAB_FILE))
        metadata = dict(
            model_config=config.to_dict(),
            max_seq_length=dataset_list[0].max_seq_length,
            max_entity_length=dataset_list[0].max_entity_length,
            max_mention_length=dataset_list[0].max_mention_length,
            arguments=vars(args),
        )
        with open(os.path.join(args.output_dir, "metadata.json"), "w") as metadata_file:
            json.dump(metadata, metadata_file, indent=2, sort_keys=True)

    def save_model(model, suffix):
        if args.local_rank != -1:
            model = model.module

        model_file = f"model_{suffix}.bin"
        torch.save(model.state_dict(), os.path.join(args.output_dir, model_file))
        optimizer_file = f"optimizer_{suffix}.bin"
        torch.save(optimizer.state_dict(), os.path.join(args.output_dir, optimizer_file))
        scheduler_file = f"scheduler_{suffix}.bin"
        torch.save(scheduler.state_dict(), os.path.join(args.output_dir, scheduler_file))
        metadata = dict(
            global_step=global_step, model_file=model_file, optimizer_file=optimizer_file, scheduler_file=scheduler_file
        )
        if args.fp16:
            amp_file = f"amp_{suffix}.bin"
            torch.save(amp.state_dict(), os.path.join(args.output_dir, amp_file))
            metadata["amp_file"] = amp_file
        with open(os.path.join(args.output_dir, f"metadata_{suffix}.json"), "w") as f:
            json.dump(metadata, f, indent=2, sort_keys=True)

    if args.local_rank == -1 or worker_index == 0:
        summary_writer = SummaryWriter(args.log_dir)
        pbar = tqdm(total=num_train_steps, initial=global_step)

    tr_loss = 0
    accumulation_count = 0
    results = []
    prev_error = False
    prev_step_time = time.time()
    prev_save_time = time.time()

    for batch in batch_generator.generate_batches():
        try:
            batch = {k: torch.from_numpy(v).to(device) for k, v in batch.items()}
            result = model(**batch)
            loss = result["loss"]
            result = {k: v.to("cpu").detach().numpy() for k, v in result.items()}

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            def maybe_no_sync():
                if (
                    hasattr(model, "no_sync")
                    and num_workers > 1
                    and accumulation_count + 1 != args.gradient_accumulation_steps
                ):
                    return model.no_sync()
                else:
                    return contextlib.ExitStack()

            with maybe_no_sync():
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

        except RuntimeError:
            if prev_error:
                logger.exception("Consecutive errors have been observed. Exiting...")
                raise
            logger.exception("An unexpected error has occurred. Skipping a batch...")
            prev_error = True
            loss = None
            torch.cuda.empty_cache()
            continue

        accumulation_count += 1
        prev_error = False
        tr_loss += loss.item()
        loss = None
        results.append(result)

        if accumulation_count == args.gradient_accumulation_steps:
            if args.max_grad_norm != 0.0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            accumulation_count = 0

            summary = {}
            summary["learning_rate"] = max(scheduler.get_last_lr())
            summary["loss"] = tr_loss
            tr_loss = 0

            current_time = time.time()
            summary["batch_run_time"] = current_time - prev_step_time
            prev_step_time = current_time

            for name in ("masked_lm", "masked_entity"):
                try:
                    summary[name + "_loss"] = np.concatenate([r[name + "_loss"].flatten() for r in results]).mean()
                    correct = np.concatenate([r[name + "_correct"].flatten() for r in results]).sum()
                    total = np.concatenate([r[name + "_total"].flatten() for r in results]).sum()
                    if total > 0:
                        summary[name + "_acc"] = correct / total
                except KeyError:
                    continue

            results = []

            if args.local_rank == -1 or worker_index == 0:
                for (name, value) in summary.items():
                    summary_writer.add_scalar(name, value, global_step)
                desc = (
                    f"epoch: {int(global_step / num_train_steps_per_epoch)} "
                    f'loss: {summary["loss"]:.4f} '
                    f'time: {datetime.datetime.now().strftime("%H:%M:%S")}'
                )
                pbar.set_description(desc)
                pbar.update()

            global_step += 1

            if args.local_rank == -1 or worker_index == 0:
                if global_step == num_train_steps:
                    # save the final model
                    save_model(model, f"epoch{args.num_epochs}")
                    time.sleep(60)
                elif global_step % num_train_steps_per_epoch == 0:
                    # save the model at each epoch
                    epoch = int(global_step / num_train_steps_per_epoch)
                    save_model(model, f"epoch{epoch}")
                if args.save_interval_sec and time.time() - prev_save_time > args.save_interval_sec:
                    save_model(model, f"step{global_step:07}")
                    prev_save_time = time.time()
                if args.save_interval_steps and global_step % args.save_interval_steps == 0:
                    save_model(model, f"step{global_step}")

            if global_step == num_train_steps:
                break

    if args.local_rank == -1 or worker_index == 0:
        summary_writer.close()
Example no. 11
 def __init__(self, bert_model_name, dropout, linear_units):
     super(BERTBaseUncased, self).__init__()
     self.bert = AutoModelForPreTraining.from_pretrained(bert_model_name)
     self.pre_classifier = nn.Linear(30522, linear_units)
     self.bert_drop = nn.Dropout(dropout)
     self.classifier = nn.Linear(linear_units, 2)
Example no. 12
 def __init__(self, config, model_dir):
     super(LongformerForClassification, self).__init__(config)
     self.longformer = AutoModelForPreTraining.from_pretrained(
         model_dir, config=config)
     self.classify = torch.nn.Linear(config.hidden_size, config.num_labels)
     self.init_weights()
Example no. 13
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name,
                                cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        raise ValueError("This script only supports loading a dataset via --dataset_name.")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    if data_args.line_by_line:
        pass
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        combined_wiki = pickle.load(open("combined_wiki.p", "rb"))

        def tokenize_function(examples):
            new_examples_text = []
            new_examples_title = []

            for article, title in zip(examples['text'], examples['title']):
                temp = article.split("Category:")
                assert len(temp) >= 1
                if len(temp) == 1:
                    assert title not in combined_wiki or len(
                        combined_wiki[title]) == 0
                new_examples_text.append(temp[0])

            examples['text'] = new_examples_text
            new_examples_text = []
            for article, title in zip(examples['text'], examples['title']):
                full_paragraph = []
                for i, paragraph in enumerate(article.split('.\n')):
                    paragraph += '.\n'
                    paragraph = paragraph.split(' ')
                    if len(full_paragraph) + len(paragraph) > 300:
                        new_examples_text.append(' '.join(full_paragraph))
                        new_examples_title.append(title)
                        full_paragraph = []
                    full_paragraph += paragraph
                if full_paragraph:
                    new_examples_text.append(' '.join(full_paragraph))
                    new_examples_title.append(title)
            examples['text'] = new_examples_text
            examples['title'] = new_examples_title
            examples['category_labels'] = []
            for title in examples['title']:
                zeros = torch.zeros(2211)
                zeros[0] = 1
                if title in combined_wiki:
                    for category_idx in combined_wiki[title]:
                        if category_idx in [5, 8, 9]:
                            continue
                        zeros[category_idx] = 1
                        zeros[0] = 0
                examples['category_labels'].append(zeros)
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True,
                             truncation=True,
                             padding=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        #
        # # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # # max_seq_length.
        # def group_texts(examples):
        #     # Concatenate all texts.
        #     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        #     total_length = len(concatenated_examples[list(examples.keys())[0]])
        #     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        #     # customize this part to your needs.
        #     total_length = (total_length // max_seq_length) * max_seq_length
        #     # Split by chunks of max_len.
        #     result = {
        #         k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        #         for k, t in concatenated_examples.items()
        #     }
        #     return result
        #
        # # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # # might be slower to preprocess.
        # #
        # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        #
        # tokenized_datasets = tokenized_datasets.map(
        #     group_texts,
        #     batched=True,
        #     num_proc=data_args.preprocessing_num_workers,
        #     load_from_cache_file=not data_args.overwrite_cache,
        # )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=data_args.mlm_probability,
        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
    )

    # Initialize our Trainer

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tags": "fill-mask"
        }
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
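
As a hedged usage sketch, the JSON-file branch of the argument parsing at the top of main() (a single .json path in sys.argv) can be fed a file like the one built below; every value and the script name are placeholders, not settings from the original project, which additionally expects a local combined_wiki.p file and a dataset with a 'title' column:

import json

train_args = {
    "model_name_or_path": "bert-base-uncased",   # ModelArguments
    "dataset_name": "wikipedia",                 # DataTrainingArguments
    "dataset_config_name": "20200501.en",
    "output_dir": "./mlm-output",                # TrainingArguments
    "do_train": True,
    "do_eval": True,
    "per_device_train_batch_size": 8,
}
with open("train_args.json", "w") as f:
    json.dump(train_args, f, indent=2)
# Launch (script name is a placeholder): python run_mlm_pretraining.py train_args.json
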
Example no. 14
def train(
    rank:           int,
    world_size:     int,
    *,
    resume:         bool,
    location:       str,
    name:           str,
    quiet:          bool,
    save_every:     int,
    validate_every: int,
    post_command:   str,
    explicit_args:  set[str],
    params:         Hyperparams,
):
    # Get filepath within path context
    fpath = lambda path: os.path.join(location, path) if isinstance(path, str) else os.path.join(location, *path)

    # Setup multi-gpu if used
    setup(rank, world_size)

    is_master = rank < 1  # Are we on the main node?
    is_distributed = rank != -1  # Are we performing distributed computing?
    num_workers = torch.distributed.get_world_size() if is_distributed else 1

    # Update locations
    TrainResults.subfolder = name
    Hyperparams.subfolder = name

    # Setup logger
    log.configure(
        os.path.join(location, name, "pretraining-worker=%s.log" % (rank if is_distributed else 0)),
        "DaLUKE pretraining on node %i" % rank,
        log_commit  = True,
        print_level = (Levels.INFO if quiet else Levels.DEBUG) if is_master else None,
        append      = resume,  # Append to existing log file if we are resuming training
    )

    post_time, post_command = parse_post_command(post_command)
    execute_post_command = False
    if post_time:
        log("Quitting in %.2f h and running command '%s'" % ((post_time-time.time())/3600, post_command))

    if resume:
        log("Resuming from %s" % name)
        # Load results and hyperparameters from earlier training
        res = TrainResults.load(location)
        # Close unended profiles
        close_tt(res.tt)
        TT.fuse(res.tt)
        res.tt = TT
        tmp_saved_pu = res.parameter_update
        loaded_params = Hyperparams.load(location)
        # Overwrite ff-size if given explicitly
        if "ff_size" in explicit_args:
            loaded_params.ff_size = params.ff_size
        params = loaded_params
    else:
        tmp_saved_pu = None
    log.section("Starting pretraining with the following hyperparameters", params)
    log("Training using %i workers" % num_workers)

    log("Reading metadata and entity vocabulary")
    with open(fpath(DatasetBuilder.metadata_file)) as f:
        metadata = json.load(f)
    with open(fpath(DatasetBuilder.entity_vocab_file)) as f:
        entity_vocab = json.load(f)
    log("Loaded metadata:", json.dumps(metadata, indent=4))
    log(f"Loaded entity vocabulary of {len(entity_vocab)} entities")
    if params.ent_min_mention:
        log("Removing entities with less than %i mentions" % params.ent_min_mention)
        entity_vocab = { ent: info for ent, info in entity_vocab.items()
            if info["count"] >= params.ent_min_mention or ent in {"[PAD]", "[UNK]", "[MASK]"} }
        log("After filtering, entity vocab now has %i entities" % len(entity_vocab))

    # Device should be cuda:rank or just cuda if single gpu, else cpu
    if is_distributed:
        device = torch.device("cuda", index=rank)
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.debug(
        "Hardware for this worker:",
        "CPU: %s" % cpuinfo.get_cpu_info()["brand_raw"],
        "GPU: %s" % (torch.cuda.get_device_name(device) if torch.cuda.is_available() else "NA"),
        sep="\t\n",
    )

    if params.entity_loss_weight:
        log("Setting up loss function with entity loss weighting")
        # Don't weigh special tokens
        weights = torch.Tensor([0, 0, 0, *(1 / info["count"] for info in entity_vocab.values() if info["count"])]).to(device)
        entity_criterion = nn.CrossEntropyLoss(weight=weights)
    else:
        log("Setting up loss function without entity loss weighting")
        entity_criterion = nn.CrossEntropyLoss()
    word_criterion = nn.CrossEntropyLoss()
    loss_calculator = lambda w, e: params.word_ent_weight * w + (1 - params.word_ent_weight) * e

    # Load dataset and training results
    bert_config = AutoConfig.from_pretrained(metadata["base-model"])
    if metadata["reduced-vocab"]:
        token_map_file = fpath(DatasetBuilder.token_map_file)
        log("Loading token map from '%s'" % token_map_file)
        token_map = np.load(token_map_file)
        tokenizer = AutoTokenizer.from_pretrained(metadata["base-model"])
        *__, unk_id = get_special_ids(tokenizer)
        token_reduction = token_map_to_token_reduction(token_map, unk_id)
    else:
        token_map = None

    log("Building dataset")
    data = DataLoader(
        location,
        metadata,
        entity_vocab,
        device,
        params.word_mask_prob,
        params.word_unmask_prob,
        params.word_randword_prob,
        params.ent_mask_prob,
        vocab_size=metadata["vocab-size"],
        token_map=token_map,
        ent_min_mention=params.ent_min_mention,
    )
    sampler = (DistributedSampler if is_distributed else RandomSampler)(data.train_examples)
    log("Built %i examples" % len(data))

    loader = data.get_dataloader(params.ff_size, sampler)
    val_loader = data.get_dataloader(params.ff_size, SequentialSampler(data.val_examples), validation=True)

    # Number of subbatches in each parameter update (batch)
    grad_accumulation_steps = params.batch_size // (params.ff_size * num_workers)
    # How many full batches can be made from the dataset
    batches_in_data = len(data) // params.batch_size
    log(
        "Parameter updates:               %i" % params.parameter_updates,
        "Subbatches per parameter update: %i" % grad_accumulation_steps,
        "Subbatches generated:            %i" % len(loader),
        "Batches needed to cover dataset: %i" % batches_in_data,
    )

    if not resume:
        # Calculate parameter differences, when at least 20k examples have been seen
        paramdiff_every = ceil(MIN_EXAMPLES_PER_PARAMDIFF / params.batch_size)
        log("Recalculating parameter differences every %i'th parameter update" % paramdiff_every)
        top_k = [1, 3, 10]
        log("Calculating top %s accuracies" % top_k)
        if validate_every:
            val_updates = unique(np.array(
                np.arange(-1, params.parameter_updates, validate_every).tolist() + [params.parameter_updates-1]
            ))[1:]
        else:
            val_updates = np.array([], dtype=int)
        res = TrainResults(
            runtime           = np.zeros(params.parameter_updates),
            lr                = np.zeros(params.parameter_updates),
            parameter_update  = 0,

            losses            = np.zeros(params.parameter_updates),
            scaled_loss       = np.zeros(params.parameter_updates),

            top_k             = top_k,
            w_losses          = np.zeros(params.parameter_updates),
            e_losses          = np.zeros(params.parameter_updates),
            w_accuracies      = np.zeros((params.parameter_updates, len(top_k))),
            e_accuracies      = np.zeros((params.parameter_updates, len(top_k))),

            val_param_updates = val_updates,
            val_losses        = np.zeros(len(val_updates)),
            val_w_losses      = np.zeros(len(val_updates)),
            val_e_losses      = np.zeros(len(val_updates)),
            val_w_accuracies  = np.zeros((len(val_updates), len(top_k))),
            val_e_accuracies  = np.zeros((len(val_updates), len(top_k))),

            paramdiff_every   = paramdiff_every,
            groups_to_slices  = None,  # Set later
            orig_params       = None,
            paramdiff_1       = None,

            luke_exclusive_params = None,  # Set later
            att_mats_from_base    = None,  # Set later

            tt = TT,
        )

    save_pus = set(range(-1, params.parameter_updates, save_every)).union({params.parameter_updates-1})
    log("Saving model at parameter updates: %s" % sorted(save_pus),
        "Validating at parameter updates: %s" % res.val_param_updates.tolist())

    # Build model, possibly by loading previous weights
    log.section("Setting up model")
    bert_config = AutoConfig.from_pretrained(metadata["base-model"])
    if params.ent_hidden_size is None:
        params.ent_hidden_size = bert_config.hidden_size
    else:
        assert params.ent_hidden_size <= bert_config.hidden_size,\
            "Entity hidden size (%i) cannot be larger than hidden size in '%s' (%i)" % (
                params.ent_hidden_size,
                metadata["base-model"],
                bert_config.hidden_size,
            )

    log("Initializing model")
    model_cls = BertAttentionPretrainTaskDaLUKE if params.bert_attention else PretrainTaskDaLUKE
    model = model_cls(
        bert_config,
        ent_vocab_size        = len(entity_vocab),
        ent_embed_size        = params.ent_embed_size,
        ent_hidden_size       = params.ent_hidden_size,
        ent_intermediate_size = params.ent_intermediate_size,
    ).to(device)
    bert_config.vocab_size = metadata["vocab-size"]
    log("Bert config", bert_config.to_json_string())

    if params.lukeinit:
        log("Initializing weights in accordance with LUKE")
        model.apply(lambda module: model.init_weights(module, bert_config.initializer_range))
    # Load parameters from base model
    if not params.no_base_model:
        log("Loading base model parameters")
        with TT.profile("Loading base model parameters"):
            base_model = AutoModelForPreTraining.from_pretrained(metadata["base-model"])
            new_weights = load_base_model_weights(
                model,
                base_model.state_dict(),
                params.bert_attention,
            )
            if metadata["reduced-vocab"]:
                log("Removing unneeded token weights")
                reduced_model = model_cls(
                    bert_config,
                    ent_vocab_size        = len(entity_vocab),
                    ent_embed_size        = params.ent_embed_size,
                    ent_hidden_size       = params.ent_hidden_size,
                    ent_intermediate_size = params.ent_intermediate_size,
                ).to(device)
                copy_with_reduced_state_dict(token_reduction, model, reduced_model)
                model = reduced_model
    else:
        new_weights = set(model.state_dict())
    # Initialize self-attention query matrices to BERT word query matrices
    att_mat_keys = set()
    if not params.bert_attention and not params.no_base_model:
        log("Initializing new attention matrices with%s PCA" % ("" if params.pcainit else "out"))
        att_mat_keys = model.init_special_attention(params.pcainit, device)
    if not resume:
        res.luke_exclusive_params = new_weights
        res.att_mats_from_base = att_mat_keys
        if is_master:
            res.orig_params = all_params(model).cpu().numpy()
    log("Pretraining model initialized with %s parameters" % thousand_seps(len(model)))

    # Unfixes params at this parameter update
    unfix_base_model_params_pu = round(params.bert_fix_prop * params.parameter_updates)
    log("Unfixing base model params after %i parameter updates" % unfix_base_model_params_pu)

    if resume:
        mpath = fpath((TrainResults.subfolder, MODEL_OUT.format(i=res.parameter_update)))
        log("Loading model from '%s'" % mpath)
        model.load_state_dict(torch.load(mpath, map_location=device))
        log(f"Resuming training saved at parameter update {res.parameter_update}")
    else:
        res.groups_to_slices, t = all_params_groups_to_slices(model, bert_config.num_hidden_layers)
        log("Parameter groups and positions", t)
        res.paramdiff_1 = { name: np.zeros(ceil(params.parameter_updates/res.paramdiff_every)) for name in res.groups_to_slices }
    if is_distributed:
        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
    non_ddp_model = model.module if is_distributed else model

    log("Setting up optimizer, scaler, and learning rate scheduler")
    optimizer = get_optimizer(non_ddp_model, params.weight_decay, params.lr)
    scaler = amp.GradScaler() if params.fp16 else None
    scheduler = get_lr_scheduler(
        optimizer,
        int(params.warmup_prop * params.parameter_updates),
        params.parameter_updates,
        unfix_base_model_params_pu,
    )
    if resume:
        optimizer.load_state_dict(torch.load(fpath((TrainResults.subfolder, OPTIMIZER_OUT.format(i=res.parameter_update))), map_location=device))
        scheduler.load_state_dict(torch.load(fpath((TrainResults.subfolder, SCHEDULER_OUT.format(i=res.parameter_update))), map_location=device))
        if params.fp16:
            scaler.load_state_dict(torch.load(fpath((TrainResults.subfolder, SCALER_OUT.format(i=res.parameter_update))), map_location=device))
        res.parameter_update += 1  # We saved the data at pu i, but should now commence pu i+1

    log.debug("Time distribution before starting training", TT)
    log_memory_stats(device)

    log.section(f"Training DaLUKE for {params.parameter_updates} parameter updates")
    model.zero_grad()  # To avoid tracking of model parameter manipulation
    model.train()

    # Start with transfer learned weights locked
    fix_base_model_params(res.luke_exclusive_params, non_ddp_model, True)
    fixed_params = True

    # Save initial parameters
    if is_master and not resume:
        with TT.profile("Saving progress"):
            paths = save_training(location, params, model.module if is_distributed else model,
                res, optimizer, scheduler, scaler, -1)
            log.debug("Saved initial state to", *paths)

    batch_iter = iter(loader)
    for i in range(res.parameter_update, params.parameter_updates):
        TT.profile("Parameter update")
        res.parameter_update = i
        if i >= unfix_base_model_params_pu and fixed_params:
            log("Unfixing base model params")
            fix_base_model_params(res.luke_exclusive_params, model, False)
            fixed_params = False
        if is_distributed and i % batches_in_data == 0:
            sampler.set_epoch(i // batches_in_data)

        # Losses and accuracies for this parameter update
        t_loss, w_loss, e_loss, s_loss = 0, 0, 0, 0
        w_accuracies = np.zeros((grad_accumulation_steps, len(res.top_k)))
        e_accuracies = np.zeros((grad_accumulation_steps, len(res.top_k)))

        # Loop over enough batches to make a parameter update
        for j in range(grad_accumulation_steps):
            TT.profile("Sub-batch")
            try:
                batch = next(batch_iter)
            except StopIteration:
                batch_iter = iter(loader)
                batch = next(batch_iter)

            TT.profile("FP and gradients")
            with amp.autocast() if params.fp16 else contextlib.ExitStack():
                word_preds, ent_preds = model(batch)
                # Compute and backpropagate loss
                word_loss = word_criterion(word_preds, batch.word_mask_labels)
                ent_loss = entity_criterion(ent_preds, batch.ent_mask_labels)
                has_entities = not torch.isnan(ent_loss).item()
                ent_loss = torch.nan_to_num(ent_loss)
            loss = loss_calculator(word_loss, ent_loss)
            loss /= grad_accumulation_steps

            # Only sync parameters on grad updates, aka last pass of this loop
            with model.no_sync() if is_distributed and j < grad_accumulation_steps - 1 else contextlib.ExitStack():
                if params.fp16:
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    s_loss += scaled_loss.item()
                else:
                    loss.backward()

            t_loss += loss.item()
            w_loss += word_loss.item() / grad_accumulation_steps
            e_loss += ent_loss.item() / grad_accumulation_steps if has_entities else 0

            if torch.cuda.is_available():
                torch.cuda.synchronize(rank if is_distributed else None)

            TT.end_profile()

            # Save accuracy for statistics
            if is_master:
                with TT.profile("Training accuracy"):
                    w_accuracies[j] = top_k_accuracy(batch.word_mask_labels, word_preds, res.top_k)
                    e_accuracies[j] = top_k_accuracy(batch.ent_mask_labels, ent_preds, res.top_k)

            TT.end_profile()

        # Update model parameters
        with TT.profile("Parameter step"):
            if params.fp16:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            model.zero_grad()

        # Calculate how much gradient has changed
        if is_master and i % res.paramdiff_every == 0:
            with torch.no_grad(), TT.profile("Parameter changes"):
                log.debug("Calculating parameter changes")
                orig_pars = torch.from_numpy(res.orig_params).to(device)
                current_pars = all_params(model.module if is_distributed else model)
                absdiff = torch.abs(current_pars-orig_pars)
                for blockname, slice_ in res.groups_to_slices.items():
                    j = i // res.paramdiff_every
                    res.paramdiff_1[blockname][j] = absdiff[slice_].sum().item()
                del orig_pars, current_pars

        res.losses[i]       = t_loss
        res.w_losses[i]     = w_loss
        res.e_losses[i]     = e_loss
        res.scaled_loss[i]  = s_loss
        res.lr[i]           = scheduler.get_last_lr()[0]
        res.w_accuracies[i] = np.mean(w_accuracies, axis=0)
        res.e_accuracies[i] = np.nanmean(e_accuracies, axis=0)
        res.runtime[i]      = TT.end_profile()
        log.debug(
            "Performed parameter update %i / %i in %.2f s" % (i, params.parameter_updates-1, res.runtime[i]),
            f"  Loss (total, word, entity, scaled): {t_loss:9.4f}, {w_loss:9.4f}, {e_loss:9.4f}, {s_loss:.4f}",
            f"  Accuracy (word, entity): {100*res.w_accuracies[i, 0]:7.2f} %, {100*res.e_accuracies[i, 0]:7.2f} %",
        )

        if i in res.val_param_updates and is_master:
            TT.profile("Model validation")
            log("Validating model")
            vi = res.val_param_updates.tolist().index(i)
            res.val_w_losses[vi], res.val_e_losses[vi], res.val_w_accuracies[vi], res.val_e_accuracies[vi] =\
                validate_model(model, val_loader, word_criterion, entity_criterion, res.top_k)
            res.val_losses[vi] = loss_calculator(res.val_w_losses[vi], res.val_e_losses[vi])
            log(
                "Validation loss:",
                "  Total:  %9.4f" % res.val_losses[vi],
                "  Word:   %9.4f" % res.val_w_losses[vi],
                "  Entity: %9.4f" % res.val_e_losses[vi],
                "Validation accuracy:",
                "  Word:   %7.2f %%" % (100 * res.val_w_accuracies[vi, 0]),
                "  Entity: %7.2f %%" % (100 * res.val_e_accuracies[vi, 0]),
            )
            model.train()
            TT.end_profile()
            log.debug("Time distribution so far", TT)

        # Save results and model
        if is_master and i in save_pus:
            with TT.profile("Saving progress"):
                save_progress(location, i, tmp_saved_pu, save_pus, params,
                    model.module if is_distributed else model, res, optimizer, scheduler, scaler)
        if i in save_pus:
            log_memory_stats(device)

        # If timed out, save, quit, and run resume command
        if post_time and time.time() > post_time:
            log_memory_stats(device)
            log.section("Time limit reached. Quitting and running command '%s'" % post_command)
            with TT.profile("Saving progress"):
                save_progress(location, i, tmp_saved_pu, save_pus, params,
                    model.module if is_distributed else model, res, optimizer, scheduler, scaler)
            execute_post_command = True
            break

    log.debug("Time distribution", TT)

    # Clean up multi-gpu if used
    cleanup(rank)

    if is_master and execute_post_command:
        os.system(post_command)
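
The loop above relies on a top_k_accuracy helper that is not shown in this snippet. Below is a minimal sketch of what such a helper could look like, assuming it receives the masked-token labels, the corresponding logits, and a list of k values; the implementation is illustrative, not the original.

import torch

def top_k_accuracy(labels: torch.Tensor, logits: torch.Tensor, top_k: list) -> list:
    # Hypothetical reimplementation, not the original helper: for each k, the fraction of
    # masked positions whose true label is among the k highest-scoring logits.
    accuracies = []
    for k in top_k:
        topk_indices = logits.topk(k, dim=-1).indices               # (num_masked, k)
        hits = (topk_indices == labels.unsqueeze(-1)).any(dim=-1)   # (num_masked,)
        accuracies.append(hits.float().mean().item())
    return accuracies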
Example n. 15
0
import torch
from transformers import AutoTokenizer, AutoModelForPreTraining

path = '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/'

tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForPreTraining.from_pretrained(path)

ques = 'Hey..how are you?'
ans = 'I will be just fine !'

# Encode the question/answer pair as a single sequence with segment ids for NSP
inputs = tokenizer.encode_plus(ques, ans,
                               add_special_tokens=True,
                               max_length=200,
                               padding='max_length',
                               truncation=True,
                               return_tensors='pt')

model.eval()
with torch.no_grad():
    output = model(input_ids=inputs['input_ids'],
                   attention_mask=inputs['attention_mask'],
                   token_type_ids=inputs['token_type_ids'])

print('Test..')
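
A possible follow-up, not part of the original snippet: assuming the checkpoint at `path` is a BertForPreTraining-style model, the second output holds the next-sentence-prediction logits, which can be turned into a probability.

import torch.nn.functional as F

# Assumption: BertForPreTraining-style output, where index 1 / .seq_relationship_logits is NSP
nsp_logits = output[1] if isinstance(output, tuple) else output.seq_relationship_logits
nsp_prob = F.softmax(nsp_logits, dim=1)[0, 0].item()  # probability that `ans` follows `ques`
print('NSP probability: %.4f' % nsp_prob)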
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, "
            "save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModelingAVSD(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        #prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    avsd_eval = True
    outputs = []
    if avsd_eval:
        outputs = []
        model = trainer.model
        data_loader = trainer.get_eval_dataloader(eval_dataset)
        for batch in tqdm(data_loader):

            for k, v in batch.items():
                batch[k] = v.to(training_args.device)

            with torch.no_grad():
                output = model(**batch)
                nsp_scores = output[1]
                #nsp_probs = F.softmax(nsp_scores, dim=1)
                nsp_scores = nsp_scores[:, 0].detach().cpu().tolist()
                outputs.extend(nsp_scores)

        results['avsd_train_set'] = outputs
        json.dump(
            results,
            open(
                '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/bertPredicitonResults_June15_corrected.txt',
                'w'))

    print('Done....')
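
The original script presumably ends with the usual entry-point guard; a minimal sketch:

if __name__ == "__main__":
    main()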
Example n. 17
0
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    config.dense_seq_output = args.dense_seq_output 
    if args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForPreTraining.from_config(config)
    ## Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    #if args.init_checkpoint is not None or found_resume_checkpoint(args):
    #    # Prepare model
    #    #model = BertForPreTraining(config)
    #    model = BertForPreTrainingSegmented(config)

    #    # for k,v in model.state_dict().items():
    #    #     print(f'model-k,len(v)={k}, {v.numel()}')

    #    #model = BertForPretraining(config)
    #    if args.init_checkpoint is None: # finding checkpoint in output_dir
    #        assert False, "code path not tested with cuda graphs"
    #        checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
    #        model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
    #        global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
    #        args.resume_step = global_step #used for throughput computation

    #        resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
    #        print("Setting init checkpoint to %s - which is the latest in %s" %(resume_init_checkpoint, args.output_dir))
    #        checkpoint=torch.load(resume_init_checkpoint, map_location="cpu")
    #    else:
    #        checkpoint=torch.load(args.init_checkpoint, map_location="cpu")["model"]
    param_optimizer = list(model.named_parameters())
    
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate,  betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2), fused=True)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step
    
    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer, start_warmup_steps=warmup_start, warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps, end_learning_rate=0.0, degree=1.0)
    #if found_resume_checkpoint(args):
    #    assert False, "code path not tested with cuda graphs"
    #    optimizer.load_state_dict(checkpoint['optimizer']) #restores m,v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)
    return model, optimizer, lr_scheduler, checkpoint, global_step
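
A hedged usage sketch, not taken from the original source: assuming `args`, `device`, and a training `batch` (a dict of tensors that includes the MLM/NSP labels) are already defined, the returned objects could drive a single optimization step roughly as follows.

# Illustrative only; variable names `args`, `device` and `batch` are assumed to exist
model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device)
model.to(device)
model.train()

outputs = model(**batch)   # BertForPreTraining-style models return a loss when labels are supplied
outputs.loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()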
Example n. 18
0
# Based on https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

import os
from pathlib import Path

import torch
from transformers import (
    AutoModelForPreTraining,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

print(torch.cuda.is_available())

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
model = AutoModelForPreTraining.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Build a line-by-line MLM dataset from the raw text file
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)

# Dynamic masking collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
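
The snippet imports Trainer and TrainingArguments but stops before using them. A hedged continuation sketch, loosely following the referenced notebook; the output directory and hyperparameters below are placeholders, not values from the original.

# Placeholder training configuration (values are illustrative, not from the original notebook)
training_args = TrainingArguments(
    output_dir="./pubmedbert-mlm",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("./pubmedbert-mlm")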