Example #1
    def __init__(self, config: dict):
        """
        SemEval Model using Transformers.

        Args:
            config: configuration parameters
        """
        super().__init__()
        self.gradient_acc_steps = config.get("gradient_acc_steps")
        self.transformer = config.get("transformer")
        self.config = config
        self.data_loader = SemEvalDataloader(self.config)
        logger.info(
            f"Loaded {len(self.data_loader.train_generator)} fine-tuning samples."
        )

        self.tokenizer = self.data_loader.tokenizer
        self.tokenizer.convert_tokens_to_ids("[E1]")
        self.tokenizer.convert_tokens_to_ids("[E2]")

        self.model = BertModel.from_pretrained(
            model_size=self.config.get("transformer"),
            force_download=False,
            pretrained_model_name_or_path=self.config.get("transformer"),
            task="classification",
            n_classes=self.data_loader.n_classes,
        )
        self.model.resize_token_embeddings(len(self.tokenizer))
        pretrained_mtb_model = self.config.get("pretrained_mtb_model", None)
        if pretrained_mtb_model and os.path.isfile(pretrained_mtb_model):
            self._load_pretrained_model(pretrained_mtb_model)

        self.train_on_gpu = torch.cuda.is_available() and config.get(
            "use_gpu", True)
        if self.train_on_gpu:
            self.model.cuda()

        self.criterion = CrossEntropyLoss(reduction="sum")

        self._start_epoch = 0
        self._train_loss = []
        self._train_acc = []
        self._test_f1 = []
        self._test_acc = []
        self._best_test_f1 = 0
        self.checkpoint_dir = os.path.join("models", "finetuning", "sem_eval",
                                           self.transformer)
        Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True)

        self._points_seen = 0
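
The data loader in this example is expected to have already added the entity-marker tokens to its tokenizer; the model's embedding matrix is then resized to match. A minimal, self-contained sketch of that pattern with the plain Hugging Face transformers API (the model name is illustrative; the marker tokens mirror the "[E1]"/"[E2]" markers used throughout these examples):

from transformers import BertModel, BertTokenizer

# Add the entity-marker special tokens and grow the embedding matrix to match.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])

model = BertModel.from_pretrained("bert-base-uncased")
model.resize_token_embeddings(len(tokenizer))

# The marker ids must resolve to distinct, known vocabulary entries.
e1_id = tokenizer.convert_tokens_to_ids("[E1]")
e2_id = tokenizer.convert_tokens_to_ids("[E2]")
assert e1_id != e2_id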
Example #2
    def __init__(self, config: dict):
        """
        Matching the Blanks Model.

        Args:
            config: configuration parameters
        """
        super().__init__()
        self.experiment_name = config.get("experiment_name")
        self.transformer = config.get("transformer")
        self.config = config
        self.data_loader = MTBPretrainDataLoader(self.config)
        self.train_len = len(self.data_loader.train_generator)
        logger.info("Loaded %d pre-training samples." % self.train_len)

        self.model = BertModel.from_pretrained(
            model_size=self.transformer,
            pretrained_model_name_or_path=self.transformer,
            force_download=False,
        )

        self.tokenizer = self.data_loader.tokenizer
        self.model.resize_token_embeddings(len(self.tokenizer))
        e1_id = self.tokenizer.convert_tokens_to_ids("[E1]")
        e2_id = self.tokenizer.convert_tokens_to_ids("[E2]")
        if e1_id == e2_id == 1:
            raise ValueError(
                "[E1] and [E2] were not mapped to distinct token ids; "
                "make sure the entity-marker tokens were added to the tokenizer."
            )

        self.train_on_gpu = torch.cuda.is_available() and config.get(
            "use_gpu", True)
        if self.train_on_gpu:
            logger.info("Train on GPU")
            self.model.cuda()

        self.criterion = MTBLoss(lm_ignore_idx=self.tokenizer.pad_token_id)
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.config.get("lr"))
        ovr_steps = (self.config.get("epochs") *
                     len(self.data_loader.train_generator) *
                     self.config.get("max_size") * 2 /
                     self.config.get("batch_size"))
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, ovr_steps // 10, ovr_steps)

        self._start_epoch = 0
        self._best_mtb_bce = 50
        self._train_loss = []
        self._train_lm_acc = []
        self._lm_acc = []
        self._mtb_bce = []
        self.checkpoint_dir = os.path.join("models", "MTB-pretraining",
                                           self.experiment_name,
                                           self.transformer)
        Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True)

        self._batch_points_seen = 0
        self._points_seen = 0
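
The optimizer setup above follows the usual recipe of exempting bias and LayerNorm parameters from weight decay and warming the learning rate up over roughly the first tenth of all training steps. A stripped-down sketch of the same recipe, using torch.optim.AdamW instead of the transformers re-export and placeholder hyperparameters:

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # stand-in for the transformer

no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

total_steps = 1000  # placeholder for epochs * steps per epoch
optimizer = torch.optim.AdamW(grouped_parameters, lr=1e-4)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=total_steps // 10, num_training_steps=total_steps
)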
Example #3
    def __init__(self, args=None, detect_entities=False):
        if args is None:
            # Fall back to the arguments saved during training.
            args = load_pickle("args.pkl")
        self.args = args
        self.cuda = torch.cuda.is_available()
        self.detect_entities = detect_entities

        if self.detect_entities:
            self.nlp = spacy.load("en_core_web_lg")
        else:
            self.nlp = None
        self.entities_of_interest = [
            "PERSON",
            "NORP",
            "FAC",
            "ORG",
            "GPE",
            "LOC",
            "PRODUCT",
            "EVENT",
            "WORK_OF_ART",
            "LAW",
            "LANGUAGE",
            "PER",
        ]

        logger.info("Loading tokenizer and model...")
        from .train_funcs import load_state

        if self.args.model_no == 0:
            from model.bert import BertModel as Model

            model = args.model_size  #'bert-base-uncased'
            model_name = "BERT"
            self.net = Model.from_pretrained(
                model,
                force_download=False,
                model_size=args.model_size,
                task="classification",
                n_classes_=self.args.num_classes,
            )
        elif self.args.model_no == 1:
            from model.albert.albert import AlbertModel as Model

            model = args.model_size  #'albert-base-v2'
            model_name = "BERT"
            self.net = Model.from_pretrained(
                model,
                force_download=False,
                model_size=args.model_size,
                task="classification",
                n_classes_=self.args.num_classes,
            )
        elif args.model_no == 2:  # BioBert
            from model.bert import BertModel, BertConfig

            model = "bert-base-uncased"
            model_name = "BioBERT"
            config = BertConfig.from_pretrained(
                "./additional_models/biobert_v1.1_pubmed/bert_config.json"
            )
            self.net = BertModel.from_pretrained(
                pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
                config=config,
                force_download=False,
                model_size="bert-base-uncased",
                task="classification",
                n_classes_=self.args.num_classes,
            )

        self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
        self.net.resize_token_embeddings(len(self.tokenizer))
        if self.cuda:
            self.net.cuda()
        start_epoch, best_pred, amp_checkpoint = load_state(
            self.net, None, None, self.args, load_best=False
        )
        logger.info("Done!")

        self.e1_id = self.tokenizer.convert_tokens_to_ids("[E1]")
        self.e2_id = self.tokenizer.convert_tokens_to_ids("[E2]")
        self.pad_id = self.tokenizer.pad_token_id
        self.rm = load_pickle("relations.pkl")
Example #4
    def __init__(self, args=None):
        if args is None:
            # Fall back to the arguments saved during training.
            args = load_pickle("args.pkl")
        self.args = args
        self.cuda = torch.cuda.is_available()

        if self.args.model_no == 0:
            from model.bert import BertModel as Model
            from model.bert_tokenizer import BertTokenizer as Tokenizer

            model = args.model_size  #'bert-large-uncased' 'bert-base-uncased'
            model_name = "BERT"
            self.net = Model.from_pretrained(
                model,
                force_download=False,
                model_size=args.model_size,
                task="fewrel",
            )
        elif self.args.model_no == 1:
            from model.albert.albert import AlbertModel as Model
            from model.albert.albert_tokenizer import (
                AlbertTokenizer as Tokenizer,
            )

            model = args.model_size  #'albert-base-v2'
            model_name = "BERT"
            self.net = Model.from_pretrained(
                model,
                force_download=False,
                model_size=args.model_size,
                task="fewrel",
            )
        elif args.model_no == 2:  # BioBert
            from model.bert import BertModel, BertConfig
            from model.bert_tokenizer import BertTokenizer as Tokenizer

            model = "bert-base-uncased"
            model_name = "BioBERT"
            config = BertConfig.from_pretrained(
                "./additional_models/biobert_v1.1_pubmed/bert_config.json"
            )
            self.net = BertModel.from_pretrained(
                pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
                config=config,
                force_download=False,
                model_size="bert-base-uncased",
                task="fewrel",
            )

        if os.path.isfile("./data/%s_tokenizer.pkl" % model_name):
            self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
            logger.info("Loaded tokenizer from saved file.")
        else:
            logger.info(
                "Saved tokenizer not found, initializing new tokenizer..."
            )
            if args.model_no == 2:
                self.tokenizer = Tokenizer(
                    vocab_file="./additional_models/biobert_v1.1_pubmed/vocab.txt",
                    do_lower_case=False,
                )
            else:
                self.tokenizer = Tokenizer.from_pretrained(
                    model, do_lower_case=False
                )
            self.tokenizer.add_tokens(
                ["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"]
            )
            save_as_pickle("%s_tokenizer.pkl" % model_name, self.tokenizer)
            logger.info(
                "Saved %s tokenizer at ./data/%s_tokenizer.pkl"
                % (model_name, model_name)
            )

        self.net.resize_token_embeddings(len(self.tokenizer))
        self.pad_id = self.tokenizer.pad_token_id

        if self.cuda:
            self.net.cuda()

        if self.args.use_pretrained_blanks == 1:
            logger.info(
                "Loading model pre-trained on blanks at ./data/test_checkpoint_%d.pth.tar..."
                % args.model_no
            )
            checkpoint_path = (
                "./data/test_checkpoint_%d.pth.tar" % self.args.model_no
            )
            checkpoint = torch.load(checkpoint_path)
            model_dict = self.net.state_dict()
            # Keep only checkpoint weights whose names exist in the current model.
            pretrained_dict = {
                k: v
                for k, v in checkpoint["state_dict"].items()
                if k in model_dict
            }
            model_dict.update(pretrained_dict)
            self.net.load_state_dict(model_dict)
            del checkpoint, pretrained_dict, model_dict

        logger.info("Loading Fewrel dataloaders...")
        self.train_loader, _, self.train_length, _ = load_dataloaders(args)
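
The use_pretrained_blanks branch restores only those checkpoint weights whose names exist in the current model, which is the standard way to load a partially matching state dict. A self-contained sketch of that pattern with placeholder modules standing in for the real network and checkpoint:

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))

# Placeholder for torch.load(checkpoint_path): only the first layer overlaps.
checkpoint = {"state_dict": torch.nn.Sequential(torch.nn.Linear(8, 8)).state_dict()}

model_dict = model.state_dict()
# Keep only the checkpoint tensors whose names exist in the current model.
pretrained_dict = {k: v for k, v in checkpoint["state_dict"].items() if k in model_dict}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)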