Example #1
    def parameter_setup(self, args):
        # Set trainability of this module.
        for param in self.model.parameters():
            param.requires_grad = bool(args.transfer_paradigm == "finetune")

        self.num_layers = self.model.config.num_hidden_layers
        if args.pytorch_transformers_max_layer >= 0:
            self.max_layer = args.pytorch_transformers_max_layer
            assert self.max_layer <= self.num_layers
        else:
            self.max_layer = self.num_layers

        # Configure scalar mixing, ELMo-style.
        if self.embeddings_mode == "mix":
            if args.transfer_paradigm == "frozen":
                log.warning(
                    "NOTE: pytorch_transformers_output_mode='mix', so scalar "
                    "mixing weights will be fine-tuned even if BERT "
                    "model is frozen.")
            # TODO: if doing multiple target tasks, allow for multiple sets of
            # scalars. See the ELMo implementation here:
            # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
            assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
                "pytorch_transformers_output_mode='mix' only supports a single set of "
                "scalars (but if you need this feature, see the TODO in "
                "the code!)")
            # Always have one more mixing weight, for lexical layer.
            self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1,
                                                   do_layer_norm=False)
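For reference, here is a minimal, self-contained sketch of the ELMo-style scalar mixing that this method configures, calling AllenNLP's ScalarMix directly. The layer count and tensor shapes are invented for illustration; note the extra mixing weight for the lexical (embedding) layer, mirroring self.max_layer + 1 above.

import torch
from allennlp.modules.scalar_mix import ScalarMix

# Hypothetical setup: 4 transformer layers plus one lexical (embedding) layer.
max_layer = 4
mix = ScalarMix(max_layer + 1, do_layer_norm=False)

# Fake per-layer activations: (batch, seq_len, hidden) for each of the 5 layers.
layer_outputs = [torch.randn(2, 7, 16) for _ in range(max_layer + 1)]

# ScalarMix returns a softmax-weighted sum of the layers, scaled by a learned gamma.
mixed = mix(layer_outputs)
print(mixed.shape)  # torch.Size([2, 7, 16])

# The mixing scalars are ordinary parameters, so they remain trainable even when
# the transformer itself is frozen (the situation the warning above describes).
print([p.requires_grad for p in mix.parameters()])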
Example #2
    def __init__(self, args, n_special=3, n_ctx=512):
        super(OpenAIEmbedderModule, self).__init__()
        self.model_cfg = model_pytorch.DEFAULT_CONFIG
        self.n_special = n_special  # number of special tokens
        self.n_ctx = n_ctx  # max context width (seq len)

        full_emb_vocab = N_VOCAB + self.n_special + self.n_ctx
        self.model = TransformerModel(
            self.model_cfg,
            vocab=full_emb_vocab,
            embeddings_mode=args.openai_embeddings_mode)

        # Need specific seed to reproduce results.
        seed = 42
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if args.openai_transformer_ckpt:
            assert n_special == 3
            log.info("Loading OpenAI transformer model from %s",
                     args.openai_transformer_ckpt)
            load_from_tf_checkpoint(self.model, args.openai_transformer_ckpt)
        else:
            loader_args = dict(n_special=n_special)
            # Path to model weights
            loader_args["path"] = OPENAI_DATA_DIR + "/"
            # Path to variable name mapping
            loader_args["path_names"] = os.path.dirname(
                model_pytorch.__file__) + "/"
            # Load pretrained weights from disk
            log.info("Loading OpenAI transformer model from %s",
                     loader_args["path"])
            model_pytorch.load_openai_pretrained_model(self.model,
                                                       **loader_args)
        log.info("Loaded OpenAI transformer model.")

        # Set trainability of this module.
        for param in self.model.parameters():
            param.requires_grad = bool(args.transfer_paradigm == "finetune")

        # Configure scalar mixing, ELMo-style.
        if args.openai_embeddings_mode == "mix":
            # TODO: if doing multiple target tasks, allow for multiple sets of
            # scalars. See the ELMo implementation here:
            # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
            assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
                "openai_embeddings_mode='mix' only supports a single set of "
                "scalars (but if you need this feature, see the TODO in "
                "the code!)")
            if args.transfer_paradigm == "frozen":
                log.warning("NOTE: openai_embeddings_mode='mix', so scalar "
                            "mixing weights will be fine-tuned even if "
                            "transformer weights are frozen.")
            # Make sure scalar mix is always tunable.
            for param in self.model.scalar_mix.parameters():
                param.requires_grad = True
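The trainability logic shared by these examples boils down to toggling requires_grad on every transformer parameter and then handing the optimizer only the parameters that remain trainable. A minimal, self-contained sketch of that pattern follows; the toy model, the head, and the transfer_paradigm value are placeholders, not part of the original code.

import torch
from torch import nn

transfer_paradigm = "frozen"  # placeholder for args.transfer_paradigm; or "finetune"

# Stand-in for the pretrained transformer.
model = nn.Sequential(nn.Embedding(100, 16), nn.Linear(16, 16))

# Same pattern as in the examples: freeze everything unless fine-tuning.
for param in model.parameters():
    param.requires_grad = bool(transfer_paradigm == "finetune")

# Downstream components (e.g. scalar mixing weights, task heads) stay trainable.
head = nn.Linear(16, 2)

# Only hand trainable parameters to the optimizer.
trainable = [p for p in list(model.parameters()) + list(head.parameters())
             if p.requires_grad]
optimizer = torch.optim.Adam(trainable, lr=1e-4)
print(len(trainable), "trainable parameter tensors")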
Example #3
    def __init__(self, args, cache_dir=None):
        super(BertEmbedderModule, self).__init__()

        self.model = pytorch_pretrained_bert.BertModel.from_pretrained(
            args.input_module, cache_dir=cache_dir)
        self.embeddings_mode = args.bert_embeddings_mode
        self.num_layers = self.model.config.num_hidden_layers
        if args.bert_max_layer >= 0:
            self.max_layer = args.bert_max_layer
        else:
            self.max_layer = self.num_layers
        assert self.max_layer <= self.num_layers

        tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
            args.input_module, cache_dir=cache_dir)
        self._sep_id = tokenizer.vocab["[SEP]"]
        self._pad_id = tokenizer.vocab["[PAD]"]

        # Set trainability of this module.
        for param in self.model.parameters():
            param.requires_grad = bool(args.transfer_paradigm == "finetune")

        # Configure scalar mixing, ELMo-style.
        if self.embeddings_mode == "mix":
            if args.transfer_paradigm == "frozen":
                log.warning("NOTE: bert_embeddings_mode='mix', so scalar "
                            "mixing weights will be fine-tuned even if BERT "
                            "model is frozen.")
            # TODO: if doing multiple target tasks, allow for multiple sets of
            # scalars. See the ELMo implementation here:
            # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
            assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
                "bert_embeddings_mode='mix' only supports a single set of "
                "scalars (but if you need this feature, see the TODO in "
                "the code!)")
            # Always have one more mixing weight, for lexical layer.
            self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1,
                                                   do_layer_norm=False)
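As a usage-level sketch of what this constructor sets up, the same pieces can be loaded with pytorch_pretrained_bert directly and inspected. Here "bert-base-uncased" stands in for args.input_module and the frozen paradigm is assumed, so treat the values as illustrative rather than taken from the original.

import pytorch_pretrained_bert

model_name = "bert-base-uncased"  # illustrative stand-in for args.input_module
model = pytorch_pretrained_bert.BertModel.from_pretrained(model_name)
tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(model_name)

num_layers = model.config.num_hidden_layers
sep_id = tokenizer.vocab["[SEP]"]
pad_id = tokenizer.vocab["[PAD]"]
print(num_layers, sep_id, pad_id)  # e.g. 12 102 0 for bert-base-uncased

# Frozen transfer paradigm: no gradients flow into BERT itself.
for param in model.parameters():
    param.requires_grad = False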