Example #1
  def __init__(self, model_spec, model_dir, device):
    """Constructor for HfModel class.

    Args:
      model_spec: A str to pass into the `pretrained_model_name_or_path`
        argument of `transformers.T5ForConditionalGeneration.from_pretrained`
        (e.g. `"t5-base"` or a path to a previously trained model) or an
        instance of the `transformers.configuration_t5.T5Config` class to use
        to directly construct the `transformers.T5ForConditionalGeneration`
        object.
      model_dir: str, directory to save and load model checkpoints.
      device: `torch.device` on which the model should be run.
    """
    # We have to import transformers here because it has a side effect of
    # creating a TensorFlow graph, which prevents eager execution from being
    # enabled in files that import hf_model.py
    import transformers  # pylint: disable=import-outside-toplevel,g-import-not-at-top
    if isinstance(model_spec, str):
      self._model = transformers.T5ForConditionalGeneration.from_pretrained(
          model_spec
      )
    elif isinstance(model_spec, transformers.T5Config):
      self._model = transformers.T5ForConditionalGeneration(model_spec)
    else:
      raise ValueError("model_spec should be a string or T5Config.")

    tf.io.gfile.makedirs(model_dir)
    self._writer = torch.utils.tensorboard.writer.SummaryWriter(model_dir)
    self._model_dir = model_dir
    self._device = device
    if self._device.type == "cuda":
      self._model.cuda()
    self._step = 0
    self.load_latest_checkpoint()
    self.to_tensor = functools.partial(torch.as_tensor, device=self._device)
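
The docstring above allows `model_spec` to be either a pretrained name/path or a `T5Config` object. A minimal usage sketch of the two paths, assuming the enclosing `HfModel` class (from T5's `hf_model.py`) is importable and that the `/tmp` model directories are only placeholders:

import torch
import transformers

device = torch.device("cpu")

# 1) From a pretrained checkpoint name or path (downloads/loads the weights).
model_a = HfModel("t5-small", model_dir="/tmp/hf_t5_small", device=device)

# 2) From a T5Config, i.e. a randomly initialized model (tiny config for illustration).
tiny_config = transformers.T5Config(d_model=64, d_ff=128, num_layers=2, num_heads=4)
model_b = HfModel(tiny_config, model_dir="/tmp/hf_t5_tiny", device=device)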

Example #2
def init_model(args):
    # Load dataset, tokenizer, model from pretrained model/vocabulary

    ## Google's SentencePiece tokenizer
    tokenizer = transformers.T5Tokenizer.from_pretrained(args.tokenizer_path)
    special_tokens = ['<mask{}>'.format(d) for d in range(0, 100)]
    special_tokens += ['<unused{}>'.format(d) for d in range(0, 100)]
    special_tokens_dict = {
        'bos_token': '<s>',
        'sep_token': '<sep>',
        'cls_token': '<cls>',
        'mask_token': '<mask>',
        'additional_special_tokens': special_tokens
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    if args.weights is None:
        # T5ForConditionalGeneration expects a T5Config, not a bare
        # vocab_size keyword, so build a config first.
        config = transformers.T5Config(vocab_size=tokenizer.vocab_size)
        model = transformers.T5ForConditionalGeneration(config)
    else:
        logging.info('Load {}.'.format(args.weights))
        model = transformers.T5ForConditionalGeneration.from_pretrained(
            args.weights)
        if model.config.vocab_size != tokenizer.vocab_size:
            logging.info('Resize embedding {} -> {}.'.format(
                model.config.vocab_size, tokenizer.vocab_size))
            model.resize_token_embeddings(tokenizer.vocab_size)
    model.eval()

    loss_func = MaskedCrossEntropyLoss()
    if torch.cuda.device_count() > 1:
        logging.info('Training in multi GPU mode using {} GPUs.'.format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    if torch.cuda.is_available():
        model.to('cuda')
        loss_func.to('cuda')
    return tokenizer, model, loss_func
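
A hedged usage sketch for `init_model` above; the attribute names follow the code, while the paths are placeholders and `transformers`, `torch`, `logging`, and `MaskedCrossEntropyLoss` are assumed to be provided by the surrounding module:

import argparse

# Hypothetical arguments; both values are placeholders.
args = argparse.Namespace(
    tokenizer_path='path/to/sentencepiece_tokenizer',
    weights=None,  # or a directory containing a pretrained checkpoint
)
tokenizer, model, loss_func = init_model(args)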

Example #3
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    accelerator = Accelerator()

    parser = transformers.HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    if accelerator.is_local_main_process:
        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            level=logging.INFO,
            datefmt="[%X]",
        )

        logger = logging.getLogger(__name__)

        # Log the training/evaluation parameters (on the main process only):
        logger.info(f"Training/evaluation parameters {training_args}")

        if not os.path.exists(training_args.output_dir):
            os.makedirs(training_args.output_dir)
            logger.info(f"Created output_dir at {training_args.output_dir}")

    # Set seed before initializing model.
    transformers.set_seed(training_args.seed)

    if data_args.dataset_pickle_path is not None:
        if accelerator.is_local_main_process:
            logger.info("Loading processed data from pickle file.")

        with open(data_args.dataset_pickle_path, "rb") as f:
            tokenized_datasets = pickle.load(f)
        if accelerator.is_local_main_process:
            logger.info("Done loading pickle data.")
    else:
        # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
        # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
        # (the dataset will be downloaded automatically from the datasets Hub).
        #
        # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
        # 'text' is found. You can easily tweak this behavior (see below).
        if data_args.dataset_name is not None:
            # Downloading and loading a dataset from the hub.
            datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)

            if "validation" not in datasets.keys():
                datasets["validation"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                )
                datasets["train"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                )
        else:
            data_files = {}
            if data_args.train_file is not None:
                data_files["train"] = data_args.train_file
            if data_args.validation_file is not None:
                data_files["validation"] = data_args.validation_file
            extension = data_args.train_file.split(".")[-1]
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)

            if "validation" not in datasets.keys():
                datasets["validation"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                )
                datasets["train"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                )
        # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
        # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = transformers.T5Config.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )

        if model_args.model_type != "t5":
            raise NotImplementedError

        config.decoder_start_token_id = config.pad_token_id
    elif model_args.model_name_or_path:
        config = transformers.T5Config.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        if accelerator.is_local_main_process:
            logger.warning("You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
        inputs_length=max_seq_length,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
    )
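    # Worked example (an illustration of the standard T5 length heuristic, not
    # a value taken from this script's arguments): with max_seq_length=512,
    # mlm_probability=0.15 and mean_noise_span_length=3.0, each example is
    # grouped into 568 raw tokens; about 85 of them fall into ~28 noise spans,
    # each span collapses to one sentinel on the encoder side
    # (483 + 28 + 1 EOS = 512 input tokens), and the decoder target holds the
    # noise tokens plus one sentinel per span and EOS (85 + 28 + 1 = 114).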

    if data_args.dataset_pickle_path is None:
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # Since we make sure that all sequences are of the same length, no attention_mask is needed.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_attention_mask=False, truncation=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= expanded_inputs_length:
                total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
            # Split by chunks of max_len.
            result = {
                k: [t[i: i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
                for k, t in concatenated_examples.items()
            }
            return result
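
        # Illustrative example with made-up numbers: if a mapped batch holds
        # 2,540 concatenated token ids and expanded_inputs_length is 568, the
        # first 4 * 568 = 2,272 ids become four chunks and the trailing 268
        # ids are dropped.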

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if accelerator.is_local_main_process:
        wandb.init(project="T5_Pretraining", entity="frostbyte")
        wandb.config.update(training_args)
        wandb.config.update(model_args)
        wandb.config.update(data_args)
        wandb.config.update(config.to_dict())

    # Initialize our training
    if model_args.model_name_or_path:
        # The PyTorch `from_pretrained` takes no `seed` argument (that is a
        # Flax-only parameter); the seed was already set via `set_seed` above.
        model = transformers.T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path, config=config)
    else:
        config.vocab_size = len(tokenizer)
        model = transformers.T5ForConditionalGeneration(config)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
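    # Note: the "no_decay" filter above follows the usual BERT-style convention
    # (no weight decay on biases or LayerNorm weights). HF's T5 registers its
    # norms as `layer_norm`/`final_layer_norm` and uses bias-free linear
    # layers, so this particular filter may match nothing for T5 and leave all
    # parameters in the weight-decay group.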

    if training_args.adafactor:
        optimizer = Adafactor(optimizer_grouped_parameters, lr=training_args.learning_rate,
                                           scale_parameter=False, relative_step=False)
    else:
        optimizer = transformers.AdamW(
            optimizer_grouped_parameters,
            lr=training_args.learning_rate,
            betas=(training_args.adam_beta1, training_args.adam_beta2),
            eps=training_args.adam_epsilon
        )

    optimizer.zero_grad()

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForT5MLM(
        tokenizer=tokenizer,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
        input_length=max_seq_length,
        target_length=targets_length,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )
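    # Rough sketch of what span corruption is expected to produce, assuming the
    # collator follows the standard T5 format (tokens shown as text; actual
    # sentinel ids depend on the tokenizer): for the chunk
    #   "the quick brown fox jumps over the lazy dog"
    # with noise spans "quick brown" and "lazy", the batch would contain
    #   input_ids: "the <extra_id_0> fox jumps over the <extra_id_1> dog </s>"
    #   labels:    "<extra_id_0> quick brown <extra_id_1> lazy <extra_id_2> </s>"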

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size)
    eval_batch_size = int(training_args.per_device_eval_batch_size)

    train_loader = torch.utils.data.DataLoader(tokenized_datasets["train"], shuffle=True,
                                               collate_fn=data_collator, batch_size=train_batch_size)
    eval_loader = torch.utils.data.DataLoader(tokenized_datasets["validation"], shuffle=False,
                                              collate_fn=data_collator, batch_size=eval_batch_size)

    # num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
    # scheduler = transformers.get_linear_schedule_with_warmup(optimizer, training_args.warmup_steps, num_train_steps)
    scheduler = NoamLR(optimizer, warmup_steps=training_args.warmup_steps)
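    # NoamLR is assumed to implement the inverse-square-root ("Noam") schedule
    # from "Attention Is All You Need": lr(step) proportional to
    # min(step ** -0.5, step * warmup_steps ** -1.5), i.e. linear warmup for
    # warmup_steps followed by step**-0.5 decay.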

    if model_args.model_resume_checkpoint is not None:
        if accelerator.is_local_main_process:
            logger.info("Resuming from checkpoint")

        checkpoint = torch.load(model_args.model_resume_checkpoint)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler = checkpoint["scheduler"]
        resume_step = checkpoint["step"]
    else:
        resume_step = -1

    model, optimizer, train_loader, eval_loader = accelerator.prepare(model, optimizer, train_loader, eval_loader)

    # for epoch in range(num_epochs):
    assert num_epochs == 1
    epoch = 0
    # only the "total" since the last logging step
    total_train_loss = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
    total_train_specialization_metric = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
    total_num_examples = torch.tensor([0.0], device=accelerator.device, requires_grad=False)

    for step, batch in tqdm(enumerate(train_loader), desc="Training", total=len(train_loader),
                            disable=not accelerator.is_local_main_process):
        cur_step = epoch * len(train_loader) + step
        if cur_step <= resume_step:
            continue

        if cur_step % training_args.eval_steps == 0:  # and cur_step > 0:
            if (cur_step) % training_args.gradient_accumulation_steps != 0:
                if accelerator.is_local_main_process:
                    logger.info("Skipping evaluate because gradients are accumulated")

                continue
            eval_loss = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
            eval_specialization_metric = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
            eval_acc = torch.tensor([0.0], device=accelerator.device, requires_grad=False)

            model.eval()
            batch.to("cpu")
            for eval_batch in tqdm(eval_loader, desc="Evaluating", leave=False,
                                   disable=not accelerator.is_local_main_process):
                optimizer.zero_grad()
                loss, decoder_last_state, decoder_cache, decoder_states, decoder_attns, decoder_self_norms, \
                decoder_cross_norms, encoder_last_state, encoder_states, encoder_attns, encoder_norms = \
                    model(**eval_batch, output_hidden_states=True, output_attentions=True, output_norms=True)

                preds = torch.argmax(decoder_last_state, dim=-1).detach().cpu()
                acc = torch.eq(preds, eval_batch["labels"].cpu()).float().sum().to(accelerator.device)
                del preds

                batch_specialization_metric, batch_size = compute_specialization_metric(norms_to_tensor(encoder_norms), accelerator.device)
                del encoder_norms

                eval_loss += loss.detach()
                eval_acc += acc / targets_length
                eval_specialization_metric += batch_specialization_metric
                del batch_specialization_metric, batch_size, loss, acc

            num_eval_examples = len(tokenized_datasets["validation"])
            avg_eval_loss = accelerator.gather(eval_loss).mean().item() / len(eval_loader)
            avg_eval_specialization_metric = accelerator.gather(eval_specialization_metric).sum().item() / num_eval_examples
            avg_eval_acc = accelerator.gather(eval_acc).sum().item() / num_eval_examples

            if accelerator.is_local_main_process:
                wandb.log({
                    "eval_loss": avg_eval_loss,
                    "eval_specialization_metric": avg_eval_specialization_metric,
                    "eval_acc": avg_eval_acc,
                }, step=cur_step * 2)  # TODO: don't hardcode, multiply by num processes

                del eval_loss, eval_acc, eval_specialization_metric

            batch.to(accelerator.device)

            optimizer.zero_grad()

        model.train()
        loss, decoder_last_state, decoder_cache, decoder_states, decoder_attns, decoder_self_norms, \
            decoder_cross_norms, encoder_last_state, encoder_states, encoder_attns, encoder_norms = \
            model(**batch, output_hidden_states=True, output_attentions=True, output_norms=True)

        accelerator.backward(loss)

        if (cur_step + 1) % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        batch_specialization_metric, batch_size = compute_specialization_metric(norms_to_tensor(encoder_norms), device=accelerator.device)

        total_train_loss += loss.detach()
        total_train_specialization_metric += batch_specialization_metric
        total_num_examples += batch_size

        del loss, batch_specialization_metric, batch_size

        if cur_step % training_args.logging_steps == 0 and cur_step > 0:
            avg_train_loss = accelerator.gather(total_train_loss).mean().item() / training_args.logging_steps
            avg_train_specialization_metric = accelerator.gather(total_train_specialization_metric).mean().item() \
                                              / accelerator.gather(total_num_examples).mean().item()
            if accelerator.is_local_main_process:
                wandb.log({
                    "train_loss": avg_train_loss,
                    "train_specialization_metric": avg_train_specialization_metric,
                    "learning_rate": scheduler.get_last_lr()[0],
                }, step=cur_step * 2)  # TODO: don't hardcode, multiply by num processes

            total_train_loss[0] = 0.0
            total_train_specialization_metric[0] = 0.0
            total_num_examples[0] = 0.0

        if cur_step % training_args.save_steps == 0 and cur_step > 0 and accelerator.is_local_main_process:
            checkpoint = {
                "step": cur_step,
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler
            }
            accelerator.save(checkpoint, f"{training_args.output_dir}/checkpoint_{cur_step // training_args.save_steps}.pt")