Example no. 1
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a language model either by supplying

        * the name of a remote model on s3 ("roberta-base" ...)
        * or a local path of a model trained via transformers ("some_dir/huggingface_model")
        * or a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: name or path of a model
        :param language: (Optional) Name of language the model was trained for (e.g. "german").
                         If not supplied, FARM will try to infer it from the model name.
        :return: Language Model

        """
        roberta = cls()
        if "farm_lm_name" in kwargs:
            roberta.name = kwargs["farm_lm_name"]
        else:
            roberta.name = pretrained_model_name_or_path
        # We need to differentiate between loading a model in FARM format and loading one in Pytorch-Transformers format
        farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            config = RobertaConfig.from_pretrained(farm_lm_config)
            farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
            roberta.model = RobertaModel.from_pretrained(farm_lm_model, config=config, **kwargs)
            roberta.language = roberta.model.config.language
        else:
            # Huggingface transformer Style
            roberta.model = RobertaModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
            roberta.language = cls._infer_language_from_name(pretrained_model_name_or_path)
        return roberta
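
A minimal usage sketch for the loader above; the enclosing class name (`Roberta` here) is an assumption, since the snippet only shows the classmethod body:

# Hedged sketch: assumes `load` is a classmethod of a FARM-style `Roberta`
# language-model wrapper; model names / paths are illustrative.
lm = Roberta.load("roberta-base")               # remote Hugging Face model, language inferred from the name
lm_local = Roberta.load("some_dir/farm_model")  # FARM checkpoint dir: needs language_model_config.json + language_model.bin
print(lm.name, lm.language)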
Example no. 2
 def __init__(self, bert_dir):
     super().__init__()
     self.bert_config = RobertaConfig.from_pretrained(bert_dir, output_hidden_states=False)
     self.intermediate = RobertaModel.from_pretrained(bert_dir)
     self.span_info_collect = SICModel(self.bert_config.hidden_size)
     self.interpretation = InterpretationModel(self.bert_config.hidden_size)
     self.output = nn.Linear(self.bert_config.hidden_size, self.bert_config.num_labels)
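
A construction sketch for the module above; the enclosing class name (`SpanClassifier`) is a placeholder, and `SICModel` / `InterpretationModel` are assumed to be defined elsewhere in the same repository:

# Hedged sketch: exercises only the constructor shown above.
model = SpanClassifier(bert_dir="roberta-base")   # loads RobertaConfig + RobertaModel weights
print(model.bert_config.hidden_size, model.bert_config.num_labels)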
Example no. 3
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak a Hugging Face RoBERTa checkpoint's weights into fairseq's RoBERTa structure.
    """
    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)
    from argparse import Namespace

    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout

    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # update config from huggingface and reuse lots of settings from fairseq pretrained
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        # roberta.model.decoder.sentence_encoder.layers[i]: TransformerSentenceEncoderLayer = roberta.model.decoder.sentence_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                k_proj.weight.data.shape == roberta.model.decoder.
                sentence_encoder.layers[i].self_attn.q_proj.weight.data.shape
                == roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.weight = self_attn.query.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.bias = self_attn.query.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.weight = self_attn.key.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.bias = self_attn.key.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.weight = self_attn.value.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight = self_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.bias = self_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.weight = self_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight = intermediate.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight = bert_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.bias = bert_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.weight = bert_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.bias = bert_output.LayerNorm.bias

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]

    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args":
        roberta.args,
        "model":
        roberta.model.state_dict(),
        # these last two were copied from the fairseq pretrained checkpoint just to make the .from_pretrained() function work
        "extra_state": {
            'train_iterator': {
                'epoch': 0
            },
            'val_loss': 1.4955725940408326
        },
        "optimizer_history": [{
            'criterion_name': 'MaskedLmLoss',
            'optimizer_name': 'MemoryEfficientFP16Optimizer',
            'lr_scheduler_state': {
                'best': 1.495530066777925
            },
            'num_updates': 500000
        }]
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict(), )
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")
    loaded_model = FairseqRobertaModel.from_pretrained(
        roberta_dump_folder_path)
    loaded_model.eval()

    # roberta.model(input_ids)
    # loaded_model.model(input_ids)

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_model.state_dict().keys() == roberta.state_dict().keys()
    for k in roberta.state_dict().keys():
        loaded_val = loaded_dict[k]
        copied_val = copied_dict[k]
        if not torch.allclose(loaded_val, copied_val, atol=1e-3):
            print(k)
    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    # except:
    #     print("Fail to save")
    # torch.save(roberta, f"{roberta_dump_folder_path}/model.pt")
    print("Done")
Example no. 4
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_neg_data', type=Path, required=True)
    parser.add_argument('--pregenerated_pos_data', type=Path, required=True)
    parser.add_argument('--validation_neg_data', type=Path, required=True)
    parser.add_argument('--validation_pos_data', type=Path, required=True)
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument('--exp_group', type=str, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--method",
                        type=str,
                        choices=[
                            'neg_samebatch', 'distill_samebatch',
                            'distill_samebatch_lstm', 'distill', 'kl',
                            'unlikelihood'
                        ])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--save_before", action='store_true')
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--max_seq_len", default=512, type=int)

    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--port_idx", type=int)

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--valid_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--kr_freq", default=0.0, type=float)
    parser.add_argument("--mlm_freq", default=0, type=float)
    parser.add_argument("--kl_w", default=1000, type=float)
    parser.add_argument("--ul_w", default=1, type=float)
    parser.add_argument("--gamma",
                        default=0.5,
                        type=float,
                        help="coeff of UL and 1-coeff of LL")
    parser.add_argument('--no_mlm',
                        action='store_true',
                        help="don't do any MLM training")
    parser.add_argument("--no_tie",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--no_ul',
                        action='store_true',
                        help="don't do any UL training")
    parser.add_argument('--no_ll',
                        action='store_true',
                        help="don't do any LL training")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        print(torch.cuda.is_available())
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print("Num of gpus: ", n_gpu)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        print("GPU Device: ", device)
        n_gpu = 1
        dist_comms.init_distributed_training(args.local_rank, args.port_idx)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

    pt_output = Path(getenv('PT_OUTPUT_DIR', ''))
    args.output_dir = Path(os.path.join(pt_output, args.output_dir))

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if args.bert_model != "roberta-base":
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        tokenizer.vocab = tokenizer.encoder

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.bert_model != "roberta-base":
        if args.method == "neg_samebatch":
            config = BertConfig.from_pretrained(args.bert_model)
            config.bert_model = args.bert_model
            core_model = BertForNegSameBatch.from_pretrained(args.bert_model,
                                                             args.gamma,
                                                             config=config)
            core_model.init_orig_bert()
        elif args.method == "unlikelihood":
            config = BertConfig.from_pretrained(args.bert_model)
            core_model = BertForNegPreTraining.from_pretrained(args.bert_model,
                                                               config=config)
        else:
            raise NotImplementedError(
                f"method {args.method} is not implemented")
    else:
        config = RobertaConfig.from_pretrained(args.bert_model)
        core_model = RobertaForNegPreTraining.from_pretrained(args.bert_model)

    core_model = core_model.to(device)

    # Prepare optimizer
    param_optimizer = list(core_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
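    # The two parameter groups above follow the standard BERT fine-tuning recipe:
    # weight decay (0.01) is applied to all parameters except biases and LayerNorm
    # weights, which are excluded via the `no_decay` name filter.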

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        core_model, optimizer = amp.initialize(core_model,
                                               optimizer,
                                               opt_level=args.fp16_opt_level)

    model = torch.nn.parallel.DistributedDataParallel(
        core_model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=True)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()

    if args.local_rank == 0 or args.local_rank == -1:
        if args.save_before:
            before_train_path = Path(
                os.path.join(args.output_dir, "before_training"))
            print("Before training path: ", before_train_path)
            before_train_path.mkdir(parents=True, exist_ok=True)
            model.module.save_pretrained(
                os.path.join(args.output_dir, "before_training"))
            tokenizer.save_pretrained(
                os.path.join(args.output_dir, "before_training"))

        # writer = SummaryWriter(log_dir=args.output_dir)
        wandb.init(project="neg_v2",
                   name=str(args.output_dir).split("/")[-1],
                   group=args.exp_group,
                   entity='negation')
        mlm_averagemeter = AverageMeter()
        ul_averagemeter = AverageMeter()
        ll_averagemeter = AverageMeter()
        kl_averagemeter = AverageMeter()

    neg_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    pos_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    neg_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    pos_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    if args.local_rank == -1:
        neg_train_sampler = RandomSampler(neg_epoch_dataset)
        pos_train_sampler = RandomSampler(pos_epoch_dataset)

        neg_valid_sampler = RandomSampler(neg_validation_dataset)
        pos_valid_sampler = RandomSampler(pos_validation_dataset)
    else:
        neg_train_sampler = DistributedSampler(neg_epoch_dataset)
        pos_train_sampler = DistributedSampler(pos_epoch_dataset)

        neg_valid_sampler = DistributedSampler(neg_validation_dataset)
        pos_valid_sampler = DistributedSampler(pos_validation_dataset)

    neg_train_dataloader = DataLoader(neg_epoch_dataset,
                                      sampler=neg_train_sampler,
                                      batch_size=args.train_batch_size)
    pos_train_dataloader = DataLoader(pos_epoch_dataset,
                                      sampler=pos_train_sampler,
                                      batch_size=args.train_batch_size)

    neg_valid_dataloader = DataLoader(neg_validation_dataset,
                                      sampler=neg_valid_sampler,
                                      batch_size=args.valid_batch_size)
    pos_valid_dataloader = DataLoader(pos_validation_dataset,
                                      sampler=pos_valid_sampler,
                                      batch_size=args.valid_batch_size)

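    # The two generators below cycle endlessly over the negated / positive
    # dataloaders, so UL/LL batches can be drawn on demand inside the main
    # MLM loop without tracking separate epoch boundaries for them.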
    def inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(neg_train_dataloader):
                yield kr_step, kr_batch

    kr_gen = inf_train_gen()

    def pos_inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(pos_train_dataloader):
                yield kr_step, kr_batch

    pos_kr_gen = pos_inf_train_gen()

    mlm_loss, neg_loss = 0, 0
    mlm_nb_it, neg_nb_it = 1, 1
    mlm_nb_ex, neg_nb_ex = 0, 0

    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        ul_tr_loss = 0
        nb_ul_tr_examples, nb_ul_tr_steps = 0, 1
        ll_tr_loss = 0
        nb_ll_tr_examples, nb_ll_tr_steps = 0, 1
        kl_tr_loss = 0
        nb_kl_tr_examples, nb_kl_tr_steps = 0, 1

        if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1
                                                   and args.local_rank == 0):
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model.module.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                if not args.no_mlm and (random.random() > args.mlm_freq):
                    model.train()
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, lm_label_ids = batch

                    outputs = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    masked_lm_labels=lm_label_ids,
                                    negated=False)

                    loss = outputs[1]
                    loss_dict = outputs[0]
                    mlm_loss += loss_dict['mlm'].item()

                    mlm_nb_it += 1
                    mlm_nb_ex += input_ids.size(0)

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()

                    if args.local_rank == 0 or args.local_rank == -1:
                        mlm_averagemeter.update(loss_dict['mlm'].item())
                        # writer.add_scalar('MLM/train', loss_dict['mlm'].item(), mlm_nb_it)
                        wandb.log({'MLM/train': loss_dict['mlm'].item()})

                        nb_tr_steps += 1
                        nb_ll_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps

                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                pbar.update(1)
                random_num = random.random()
                if random_num > args.kr_freq:
                    if args.method in ["neg_samebatch"]:
                        ul_step, ul_batch = next(kr_gen)
                        ul_batch = tuple(t.to(device) for t in ul_batch)
                        ul_input_ids, ul_input_mask, ul_segment_ids, ul_lm_label_ids = ul_batch

                        ll_step, ll_batch = next(pos_kr_gen)
                        ll_batch = tuple(t.to(device) for t in ll_batch)
                        ll_input_ids, ll_input_mask, ll_segment_ids, ll_lm_label_ids = ll_batch

                        batch_mask = torch.zeros(
                            (ul_input_ids.size(0) + ll_input_ids.size(0)),
                            dtype=ll_input_mask.dtype,
                            device=device)
                        batch_mask[:ul_input_ids.size(0)] = 1.

                        outputs = model(
                            input_ids=torch.cat([ul_input_ids, ll_input_ids],
                                                0),
                            attention_mask=torch.cat(
                                [ul_input_mask, ll_input_mask], 0),
                            token_type_ids=torch.cat(
                                [ul_segment_ids, ll_segment_ids], 0),
                            masked_lm_labels=torch.cat(
                                [ul_lm_label_ids, ll_lm_label_ids], 0),
                            negated=True,
                            batch_neg_mask=batch_mask)

                        loss = outputs[1] * args.ul_w
                        loss_dict = outputs[0]

                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train': loss_dict['neg'].item(),
                                'LL/train': loss_dict['pos'].item()
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            ll_averagemeter.update(loss_dict['pos'].item())
                        neg_nb_it += 1

                    elif random.random() > 0.5 and not args.no_ul:
                        kr_step, kr_batch = next(kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=True)

                        loss = outputs[1] * args.ul_w

                        loss_dict = outputs[0]
                        nb_ul_tr_steps += 1

                        neg_loss += loss_dict['neg'].item()
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train':
                                loss_dict['neg'].item(),
                                'KL/train':
                                loss_dict['kl'].item() * args.kl_w
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            kl_averagemeter.update(loss_dict['kl'].item() *
                                                   args.kl_w)

                        neg_nb_it += 1
                    elif not args.no_ll:
                        kr_step, kr_batch = next(pos_kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=False)
                        loss = outputs[1]
                        loss_dict = outputs[0]
                        nb_ll_tr_steps += 1

                        mlm_loss += loss_dict['mlm'].item()

                        mlm_nb_it += 1
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({'LL/train': loss_dict['mlm'].item()})
                            ll_averagemeter.update(loss_dict['mlm'].item())

                        mlm_nb_ex += input_ids.size(0)
                    else:
                        continue

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()
                    if args.local_rank == 0 or args.local_rank == -1:
                        nb_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                if n_gpu > 1 and args.local_rank == -1 or (
                        n_gpu <= 1 and args.local_rank == 0):
                    if False and (step + 1) % 100 == 0:
                        neg_valid_res = validate(
                            model=model,
                            dataloader=neg_valid_dataloader,
                            device=device,
                            negated=True)
                        pos_valid_res = validate(
                            model=model,
                            dataloader=pos_valid_dataloader,
                            device=device,
                            negated=False)
                        wandb.log({
                            'neg/valid/p@1': neg_valid_res % 100.,
                            'pos/valid/p@1': pos_valid_res % 100.
                        })

    # Save a trained model
    if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1
                                               and args.local_rank == 0):
        print("Saving model")
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.module.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        print(str(wandb.run.id))
        pickle.dump(
            str(wandb.run.id),
            open(os.path.join(args.output_dir, 'wandb_run_id.pkl'), 'wb'))
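
The script above is driven entirely by argparse; a hypothetical invocation (script name and data paths are placeholders) is sketched below as a comment:

# python train_negation_mlm.py \
#     --pregenerated_data data/mlm \
#     --pregenerated_neg_data data/neg --pregenerated_pos_data data/pos \
#     --validation_neg_data data/val_neg --validation_pos_data data/val_pos \
#     --output_dir out --exp_group baseline \
#     --bert_model roberta-base --method unlikelihood --epochs 3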
Example no. 5
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 transformer_weights_model: str = None,
                 reset_classifier: bool = False,
                 binary_loss: bool = False,
                 layer_freeze_regexes: List[str] = None,
                 on_load: bool = False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        if on_load:
            logging.info(f"Skipping loading of initial Transformer weights")
            transformer_config = RobertaConfig.from_pretrained(
                pretrained_model)
            self._transformer_model = RobertaModel(transformer_config)

        elif transformer_weights_model:
            logging.info(
                f"Loading Transformer weights model from {transformer_weights_model}"
            )
            transformer_model_loaded = load_archive(transformer_weights_model)
            self._transformer_model = transformer_model_loaded.model._transformer_model
        else:
            self._transformer_model = RobertaModel.from_pretrained(
                pretrained_model)

        for name, param in self._transformer_model.named_parameters():
            grad = requires_grad
            if layer_freeze_regexes and grad:
                grad = not any(
                    [bool(re.search(r, name)) for r in layer_freeze_regexes])
            param.requires_grad = grad

        transformer_config = self._transformer_model.config

        self._output_dim = transformer_config.hidden_size
        classifier_input_dim = self._output_dim
        classifier_output_dim = 1
        transformer_config.num_labels = classifier_output_dim
        self._classifier = None
        if not on_load and transformer_weights_model \
                and hasattr(transformer_model_loaded.model, "_classifier") \
                and not reset_classifier:
            self._classifier = transformer_model_loaded.model._classifier
            old_dims = (self._classifier.dense.in_features,
                        self._classifier.out_proj.out_features)
            new_dims = (classifier_input_dim, classifier_output_dim)
            if old_dims != new_dims:
                logging.info(
                    f"NOT copying Transformer classifier weights, incompatible dims: {old_dims} vs {new_dims}"
                )
                self._classifier = None
        if self._classifier is None:
            self._classifier = RobertaClassificationHead(transformer_config)

        self._binary_loss = binary_loss
        self._accuracy = CategoricalAccuracy()
        self._sigmoid = torch.nn.Sigmoid()
        if self._binary_loss:
            self._loss = torch.nn.BCEWithLogitsLoss()
        else:
            self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
        self._padding_value = 1  # The index of the RoBERTa padding token
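
A construction sketch for the AllenNLP-style model above; the enclosing `Model` subclass is unnamed in the snippet, so `RobertaClassifierModel` below is a placeholder:

# Hedged sketch: exercises only the constructor shown above.
vocab = Vocabulary()
model = RobertaClassifierModel(
    vocab,
    pretrained_model="roberta-large",
    binary_loss=True,      # BCEWithLogitsLoss instead of CrossEntropyLoss
)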
Example no. 6
 def init_encoder(cls, cfg_name: str, projection_dim: int = 0, dropout: float = 0.1, **kwargs) -> BertModel:
     cfg = RobertaConfig.from_pretrained(cfg_name if cfg_name else 'roberta-base')
     if dropout != 0:
         cfg.attention_probs_dropout_prob = dropout
         cfg.hidden_dropout_prob = dropout
     return cls.from_pretrained(cfg_name, config=cfg, project_dim=projection_dim, **kwargs)
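
A usage sketch for the factory method above; `HFRobertaEncoder` is a placeholder for whatever class defines `init_encoder` (a DPR-style encoder that subclasses `RobertaModel`, so that `cls.from_pretrained` works and accepts `project_dim`):

# Hedged sketch: class name is illustrative.
encoder = HFRobertaEncoder.init_encoder("roberta-base", projection_dim=0, dropout=0.1)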