Example #1
    def save_best_averaged_checkpoint(self, args, trainer, extra_state: Dict[str, Any]):
        """
        save() should always be called before this function, to ensure that
        extra_state and self._averaged_params have been updated correctly.
        """
        best_averaged_checkpoint_filename = os.path.join(
            args.save_dir, constants.AVERAGED_CHECKPOINT_BEST_FILENAME
        )
        self.log_if_verbose(
            f"| Preparing to save new best averaged checkpoint to "
            f"{best_averaged_checkpoint_filename}."
        )
        state_dict = trainer.state_dict()
        state_dict["args"] = args
        state_dict["cfg"] = None
        state_dict["model"] = self._averaged_params
        state_dict["extra_state"].update(extra_state)
        state_dict = fairseq_utils.move_to_cpu(state_dict)
        checkpoint_utils.torch_persistent_save(
            obj=state_dict,
            filename=best_averaged_checkpoint_filename,
        )
        self.log_if_verbose(
            f"| Finished saving new best averaged checkpoint to "
            f"{best_averaged_checkpoint_filename}."
        )
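For reference, a checkpoint written this way can be read back with checkpoint_utils.load_checkpoint_to_cpu, which several of the later examples use. A minimal sketch, with a hypothetical path (the real name comes from constants.AVERAGED_CHECKPOINT_BEST_FILENAME):

from fairseq import checkpoint_utils

# Hypothetical path; inspect the averaged checkpoint saved above.
state = checkpoint_utils.load_checkpoint_to_cpu(
    "checkpoints/averaged_checkpoint_best.pt")
print(state.keys())         # includes "args", "cfg", "model" and "extra_state" set above
print(len(state["model"]))  # number of averaged parameter tensors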
Example #2
    def test_torch_persistent_save_async(self):
        state_dict = {}
        filename = "async_checkpoint.pt"

        with patch(f"{checkpoint_utils.__name__}.PathManager.opena"
                   ) as mock_opena:
            with patch(f"{checkpoint_utils.__name__}._torch_persistent_save"
                       ) as mock_save:
                checkpoint_utils.torch_persistent_save(state_dict,
                                                       filename,
                                                       async_write=True)
                mock_opena.assert_called_with(filename, "wb")
                mock_save.assert_called()
Example #3
    def test_torch_persistent_save_async(self):
        cfg = OmegaConf.create()
        cfg.dataset = OmegaConf.create()
        cfg.dataset.write_checkpoints_asynchronously = True
        state_dict = {}
        filename = "async_checkpoint.pt"

        with patch(f"{checkpoint_utils.__name__}.PathManager.opena"
                   ) as mock_opena:
            with patch(f"{checkpoint_utils.__name__}._torch_persistent_save"
                       ) as mock_save:
                checkpoint_utils.torch_persistent_save(cfg.dataset, state_dict,
                                                       filename)
                mock_opena.assert_called_with(filename, "wb")
                mock_save.assert_called()
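The two test methods above appear to come from different fairseq versions (the torch_persistent_save signature changed) and are shown without their enclosing test class. A minimal harness under the assumption that they are unittest.TestCase methods (the class name is hypothetical):

import unittest
from unittest.mock import patch

from omegaconf import OmegaConf
from fairseq import checkpoint_utils


class TestCheckpointUtils(unittest.TestCase):
    # the test_torch_persistent_save_async methods above would live here
    pass


if __name__ == "__main__":
    unittest.main()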
Example #4
import os
from collections import OrderedDict

from fairseq import checkpoint_utils


def split_create(model,
                 source_path="/home/wannabe/Documents/ufal-transformer-big/transformer_checkpoints/checkpoint_last.pt",
                 target_path="/home/wannabe/Documents/ufal-transformer-big/encoder_checkpoints/checkpoint_last.pt"
                 ):
    """
    Args:
        source_path: A fairseq.Language_model that whose params will be initialized with the params
                    from the Transformer model.
        target_path: A fairseq.Transformer model that has been trained on the Translation task
        modified_path: A string object denoting the path to where you wish to store the model
    """

    #check if the file exists, if it does return
    if os.path.isfile(target_path):
        return

    extended_list = []
    for key in model.state_dict().keys():
        if key.startswith('encoder.layer_norm') or key.startswith('out_layer'):
            extended_list.append((key, model.state_dict()[key]))

    translation_state = checkpoint_utils.load_checkpoint_to_cpu(source_path)

    #filtered state has the encoder parts of the translation model
    filtered_state = []

    for key in translation_state['model'].keys():
        if key.startswith('encoder'):
            filtered_state.append((key, translation_state['model'][key]))

    filtered_state.extend(extended_list)

    list_encoder_state_dict = OrderedDict(filtered_state)
    translation_state['model'] = list_encoder_state_dict

    #save the encoder weights of the translation model at the target path 
    checkpoint_utils.torch_persistent_save(translation_state, target_path)

    return 
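A possible invocation, with placeholder paths (not from the original project), to extract the encoder weights of a trained translation checkpoint:

# Hypothetical usage; 'model' is an instantiated model that already has the
# 'encoder.layer_norm*' / 'out_layer*' parameters to append.
split_create(
    model,
    source_path="checkpoints/translation/checkpoint_last.pt",
    target_path="checkpoints/encoder/checkpoint_last.pt",
)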
Example #5
import argparse

import torch
from fairseq.checkpoint_utils import load_checkpoint_to_cpu, torch_persistent_save


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint",
                        type=str,
                        required=True,
                        help="Wav2Vec checkpoint to be prepared")
    args = parser.parse_args()

    ckpt = load_checkpoint_to_cpu(args.checkpoint)

    ckpt['args'] = None
    ckpt['cfg'] = ckpt['cfg']['model']['w2v_args']

    for key in list(ckpt['model'].keys()):
        w = ckpt['model'].pop(key)
        if key.startswith('w2v_encoder.w2v_model.'):
            new_key = key.replace('w2v_encoder.w2v_model.', '')
            ckpt['model'][new_key] = w

    # These are Wav2Vec2 parameters which will be removed in Wav2VecEncoder
    if 'small' in args.checkpoint.split('/')[-1]:
        ckpt['model']['quantizer.vars'] = torch.randn(1, 640, 128)
        ckpt['model']['quantizer.weight_proj.weight'] = torch.randn(640, 512)
        ckpt['model']['quantizer.weight_proj.bias'] = torch.randn(640)
        ckpt['model']['project_q.weight'] = torch.randn(256, 256)
        ckpt['model']['project_q.bias'] = torch.randn(256)
        ckpt['model']['final_proj.weight'] = torch.randn(256, 768)
        ckpt['model']['final_proj.bias'] = torch.randn(256)
    else:
        ckpt['model']['quantizer.vars'] = torch.randn(1, 640, 384)
        ckpt['model']['quantizer.weight_proj.weight'] = torch.randn(640, 512)
        ckpt['model']['quantizer.weight_proj.bias'] = torch.randn(640)
        ckpt['model']['project_q.weight'] = torch.randn(768, 768)
        ckpt['model']['project_q.bias'] = torch.randn(768)
        ckpt['model']['final_proj.weight'] = torch.randn(768, 1024)
        ckpt['model']['final_proj.bias'] = torch.randn(768)

    torch_persistent_save(ckpt, args.checkpoint)


if __name__ == "__main__":
    main()
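As a quick sanity check (not part of the original script), the rewritten checkpoint can be reloaded to confirm the key renaming and the injected quantizer/projection tensors:

# Hypothetical follow-up check; "checkpoint.pt" stands for the path passed via --checkpoint.
ckpt = load_checkpoint_to_cpu("checkpoint.pt")
assert not any(k.startswith("w2v_encoder.w2v_model.") for k in ckpt["model"])
assert "quantizer.vars" in ckpt["model"] and "final_proj.weight" in ckpt["model"]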
Example #6
from collections import OrderedDict

from fairseq import checkpoint_utils


def init_tmodel(source_path, target_path, modified_path):
    """
    Args:
        source_path: Path to a fairseq language-model/encoder checkpoint whose
                     parameters will be copied into the Transformer's encoder.
        target_path: Path to a fairseq Transformer checkpoint that has been
                     trained on the translation task.
        modified_path: Path where the modified checkpoint will be stored.
    """
    encoder_state = checkpoint_utils.load_checkpoint_to_cpu(source_path)
    translation_state = checkpoint_utils.load_checkpoint_to_cpu(target_path)

    filtered_state = []

    for key in encoder_state['model'].keys():
        filtered_state.append((key, encoder_state['model'][key]))

    #Remove the final linear and layer-norm entries to maintain compatibility
    filtered_state.pop()
    filtered_state.pop()
    filtered_state.pop()
    filtered_state.pop()

    list_translation_state = []

    for key in translation_state['model'].keys():
        list_translation_state.append((key, translation_state['model'][key]))

    #Replace the encoder entries (matched by position) with the filtered
    #language-model parameters
    for index, key in enumerate(list_translation_state):
        if key[0].startswith('encoder'):
            list_translation_state[index] = filtered_state[index]

    list_translation_state_dict = OrderedDict(list_translation_state)
    translation_state['model'] = list_translation_state_dict
    
    checkpoint_utils.torch_persistent_save(translation_state, modified_path)

    return 
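A possible invocation, with placeholder paths; the source checkpoint could, for instance, be the encoder-only file produced by split_create in Example #4:

# Hypothetical usage; all paths are placeholders.
init_tmodel(
    source_path="checkpoints/encoder/checkpoint_last.pt",
    target_path="checkpoints/translation/checkpoint_last.pt",
    modified_path="checkpoints/translation/checkpoint_init.pt",
)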
Example #7
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    from argparse import Namespace

    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)

    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout

    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # update config from huggingface and reuse lots of settings from fairseq pretrained
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        # roberta.model.decoder.sentence_encoder.layers[i]: TransformerSentenceEncoderLayer = roberta.model.decoder.sentence_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                k_proj.weight.data.shape == roberta.model.decoder.
                sentence_encoder.layers[i].self_attn.q_proj.weight.data.shape
                == roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.weight = self_attn.query.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.bias = self_attn.query.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.weight = self_attn.key.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.bias = self_attn.key.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.weight = self_attn.value.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight = self_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.bias = self_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.weight = self_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight = intermediate.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight = bert_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.bias = bert_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.weight = bert_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.bias = bert_output.LayerNorm.bias

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]

    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # the last two entries were copied from a fairseq pretrained checkpoint
        # just so that .from_pretrained() works
        "extra_state": {
            "train_iterator": {"epoch": 0},
            "val_loss": 1.4955725940408326,
        },
        "optimizer_history": [{
            "criterion_name": "MaskedLmLoss",
            "optimizer_name": "MemoryEfficientFP16Optimizer",
            "lr_scheduler_state": {"best": 1.495530066777925},
            "num_updates": 500000,
        }],
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict(), )
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")
    loaded_model = FairseqRobertaModel.from_pretrained(
        roberta_dump_folder_path)
    loaded_model.eval()

    # roberta.model(input_ids)
    # loaded_model.model(input_ids)

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_model.state_dict().keys() == roberta.state_dict().keys()
    for k in roberta.state_dict().keys():
        loaded_val = loaded_dict[k]
        copied_val = copied_dict[k]
        if not torch.allclose(loaded_val, copied_val, atol=1e-3):
            print(k)
    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    # except:
    #     print("Fail to save")
    # torch.save(roberta, f"{roberta_dump_folder_path}/model.pt")
    print("Done")
Example #8
def main(args):
    state = checkpoint_utils.load_checkpoint_to_cpu(args.checkpoint)
    ns = state["args"]
    model = state["model"]
    ns.arch = "transformer_modular"

    if (args.encoder_attention_heads_active is None
            and args.decoder_attention_heads_active is None):
        raise ValueError(
            'Either --encoder-attention-heads-active or '
            '--decoder-attention-heads-active option must be set.')
    if args.encoder_attention_heads_active is None:
        args.encoder_attention_heads_active = args.decoder_attention_heads_active

    if args.encoder_modular_layer_indices is not None:
        ns.encoder_modular_layer_indices = "({})".format(
            args.encoder_modular_layer_indices)
        model = convert_model(model, ns, coder="encoder", att_type="self_attn")
    if args.decoder_modular_layer_indices is not None:
        ns.decoder_modular_layer_indices = "({})".format(
            args.decoder_modular_layer_indices)
        model = convert_model(model, ns, coder="decoder", att_type="self_attn")
        model = convert_model(model,
                              ns,
                              coder="decoder",
                              att_type="encoder_attn")

    ctrl_enc = ModularCtrl(ns.encoder_embed_dim,
                           ns.encoder_attention_heads,
                           args.encoder_attention_heads_active,
                           hidden_depth=args.ctrl_hidden_depth,
                           hidden_dim=args.ctrl_hidden_dim,
                           ctrl_type=args.ctrl_type)
    ns.module_ctrl_hidden_depth = args.ctrl_hidden_depth
    ns.module_ctrl_hidden_dim = args.ctrl_hidden_dim
    ns.module_ctrl_type = args.ctrl_type

    for k, v in ctrl_enc.state_dict().items():
        model["encoder.module_ctrl.{}".format(k)] = v

    if not args.share_encoder_ctrl:
        if args.decoder_attention_heads_active is None:
            raise ValueError("Missing ``decoder-attention-heads-active'' "
                             "when ``share-encoder-ctrl'' is disabled.")
        ns.share_encoder_ctrl = False
        ctrl_dec = ModularCtrl(ns.decoder_embed_dim,
                               ns.decoder_attention_heads,
                               args.decoder_attention_heads_active,
                               hidden_depth=args.ctrl_hidden_depth,
                               hidden_dim=args.ctrl_hidden_dim,
                               ctrl_type=args.ctrl_type)
        for k, v in ctrl_dec.state_dict().items():
            model["decoder.module_ctrl.{}".format(k)] = v
    else:
        ns.share_encoder_ctrl = True

    ns.arch = "transformer_modular"
    ns.criterion = "label_smoothed_cross_entropy_modular"
    ns.task = "translation_modular"
    ns.encoder_attention_heads_active = args.encoder_attention_heads_active

    state["args"] = ns
    state["model"] = model

    for i, _ in enumerate(state["optimizer_history"]):
        state["optimizer_history"][i][
            "criterion_name"] = 'LabelSmoothedCrossEntropyModularCriterion'

    state = utils.move_to_cpu(state)

    with PathManager.open(args.save_as, "wb") as f:
        checkpoint_utils.torch_persistent_save(state, f)
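A hedged sanity check (not in the original script): reload the converted checkpoint and confirm the architecture and criterion were rewritten as intended.

# Reload the checkpoint written above and verify the edits took effect.
new_state = checkpoint_utils.load_checkpoint_to_cpu(args.save_as)
assert new_state["args"].arch == "transformer_modular"
assert all(h["criterion_name"] == "LabelSmoothedCrossEntropyModularCriterion"
           for h in new_state["optimizer_history"])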