def create_and_check_bert_for_pretraining(self, config, input_ids,
                                           token_type_ids, input_mask,
                                           sequence_labels,
                                           token_labels, choice_labels):
     model = BertForPreTraining(config=config)
     model.to(torch_device)
     model.eval()
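     # Note: `masked_lm_labels` and the (loss, prediction_scores, seq_relationship_score)
     # tuple follow an older transformers API; newer releases use `labels` and a ModelOutput.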
     loss, prediction_scores, seq_relationship_score = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         masked_lm_labels=token_labels,
         next_sentence_label=sequence_labels)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
         "seq_relationship_score": seq_relationship_score,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.parent.assertListEqual(
         list(result["seq_relationship_score"].size()),
         [self.batch_size, 2])
     self.check_loss_output(result)
Example #2
def test_model():
    hf_config = BertConfig(
        vocab_size=1000,
        hidden_size=100,
        num_attention_heads=2,
        intermediate_size=256,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        max_position_embeddings=100,
        num_hidden_layers=1,
    )

    config = BertBackboneConfig(
        hidden_dim=hf_config.hidden_size,
        n_heads=hf_config.num_attention_heads,
        layer_norm_eps=hf_config.layer_norm_eps,
        intermediate_dim=hf_config.intermediate_size,
        n_layers=hf_config.num_hidden_layers,
        n_pos=hf_config.max_position_embeddings,
        n_types=hf_config.type_vocab_size,
        vocab_size=hf_config.vocab_size,
        pad_token_id=hf_config.pad_token_id,
        attention_probs_dropout=hf_config.attention_probs_dropout_prob,
        hidden_dropout=hf_config.hidden_dropout_prob)

    bs = 8
    seq_len = 12

    seed_everything(228)
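    # The same seed is set again below so both models start from identical weights.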
    hf_model = BertForPreTraining(hf_config)
    token_ids = torch.randint(low=0,
                              high=hf_config.vocab_size,
                              size=(bs, seq_len))
    clf_labels = torch.randint(low=0, high=2, size=(bs, ))
    hf_loss = hf_model(token_ids,
                       masked_lm_labels=token_ids,
                       next_sentence_label=clf_labels)[0]

    seed_everything(228)
    backbone = BertBackbone(config)

    model = BertPreTrainingModel(backbone=backbone)
    token_ids = torch.randint(low=0,
                              high=hf_config.vocab_size,
                              size=(bs, seq_len))
    clf_labels = torch.randint(low=0, high=2, size=(bs, ))

    inp = BertBackboneInput(token_ids=token_ids,
                            token_type_ids=None,
                            token_pos=None)

    loss = model(inp, head_labels={'lm': token_ids, 'clf': clf_labels}).loss
Example #3
def get_bert_save_dict():
    import os

    state_path = 'data/bert-large.pt'

    if os.path.exists(state_path):
        state = torch.load(state_path)
    else:
        model = BertForPreTraining.from_pretrained(globals.bert_model)
        state = model.state_dict()
        # cache state
        torch.save(state, state_path)
    return state
Example #4
def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path,
                                      output_folder):
    # Instantiate model
    logger.info(f'Loading model based on config from {config_path}...')
    config = BertConfig.from_json_file(config_path)
    model = BertForPreTraining(config)

    # Load weights from checkpoint
    logger.info(f'Loading weights from checkpoint {tf_checkpoint_path}...')
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Create dirs
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    # Save pytorch-model
    f_out_model = os.path.join(output_folder, 'pytorch_model.bin')
    logger.info(f'Saving PyTorch model to {f_out_model}...')
    torch.save(model.state_dict(), f_out_model)

    # Save config to output
    f_out_config = os.path.join(output_folder, 'config.json')
    logger.info(f'Saving config to {f_out_config}...')
    config.to_json_file(f_out_config)
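
# A minimal, hypothetical invocation of the converter above; these paths are
# placeholders, not values from the original source:
convert_tf2_checkpoint_to_pytorch(
    tf_checkpoint_path='path/to/tf2/bert_model.ckpt',
    config_path='path/to/tf2/bert_config.json',
    output_folder='converted_pytorch_model')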
Example #5
    def _load_google_checkpoint(self):
        logger.info('Loading Checkpoint from Google for Pre training')

        download_and_extract(self.google_checkpoint_location, './')

        checkpoint_dir = os.path.join('./', self.google_checkpoint_root)
        config_location = os.path.join(checkpoint_dir, 'bert_config.json')
        index_location = os.path.join(checkpoint_dir, 'bert_model.ckpt.index')

        logger.info(
            f'Config file: {config_location}. Index file: {index_location}')

        config = BertConfig.from_json_file(config_location)
        self.bert = BertForPreTraining.from_pretrained(index_location,
                                                       config=config,
                                                       from_tf=True)
Example #6
 def from_pretrained(self, model_dir):
     self.encoder_config = BertConfig.from_pretrained(model_dir)
     self.tokenizer = BertTokenizer.from_pretrained(
         path.join(model_dir, 'tokenizer'),
         do_lower_case=args.do_lower_case)
     self.utt_encoder = BertForPreTraining.from_pretrained(
         path.join(model_dir, 'utt_encoder'))
     self.context_encoder = BertForSequenceClassification.from_pretrained(
         path.join(model_dir, 'context_encoder'))
     self.context_mlm_trans = BertPredictionHeadTransform(
         self.encoder_config)
     self.context_mlm_trans.load_state_dict(
         torch.load(path.join(model_dir, 'context_mlm_trans.pkl')))
     self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
     self.context_order_trans.load_state_dict(
         torch.load(path.join(model_dir, 'context_order_trans.pkl')))
     self.decoder_config = BertConfig.from_pretrained(model_dir)
     self.decoder = BertLMHeadModel.from_pretrained(
         path.join(model_dir, 'decoder'))
Example #7
 def create_and_check_for_pretraining(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = BertForPreTraining(config=config)
     model.to(torch_device)
     model.eval()
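     # Newer transformers API: `labels=` replaces `masked_lm_labels`, and the call
     # returns a ModelOutput with `prediction_logits` / `seq_relationship_logits`.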
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         labels=token_labels,
         next_sentence_label=sequence_labels,
     )
     self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
     self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
Example #8
def test():
    bert_model_path = '../checkpoints/bert-base-chinese/'  # pytorch_model.bin
    bert_config_path = '../checkpoints/bert-base-chinese/'  # bert_config.json
    vocab_path = '../checkpoints/bert-base-chinese/vocab.txt'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # model = BertModel.from_pretrained(bert_model_path, config=bert_config_path)
    model = BertForPreTraining.from_pretrained(bert_model_path,
                                               config=bert_config_path)

    text_batch = ["哈哈哈", "嘿嘿嘿", "嘿嘿嘿", "嘿嘿嘿"]
    encoding = tokenizer(text_batch,
                         return_tensors='pt',
                         padding=True,
                         truncation=True)
    input_ids = encoding['input_ids']
    print(input_ids)
    print(input_ids.shape)
    output1, output2 = model(input_ids)
    print(output1)
    print(output2)
    print(output1.shape)
    print(output2.shape)
Example #9
    def __init__(self,
                 pretrained_model,
                 tokenizer_name_or_path: str,
                 data_dir: str,
                 batch_size: int,
                 max_train_examples: int = None,
                 max_eval_examples: int = None,
                 train_strategy='train-all-lexical') -> None:
        super(LexicalTrainingModel, self).__init__()

        self.save_hyperparameters()

        if pretrained_model.startswith('google-checkpoint'):
            self._load_google_checkpoint()
        else:
            self.bert = BertForPreTraining.from_pretrained(pretrained_model)

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name_or_path)

        self.__setup_lexical_for_training()

        self.train_dataset = None
        self.eval_dataset = None
        self.test_dataset = None
Example #10
import json

import torch
from transformers import BertTokenizer, BertForPreTraining, BertForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

max_length = 100
k = 10
device = "cpu"

pretrained_weights = '/data5/private/suyusheng/task_selecte/bert-base-uncased-128/'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

fine_tuned_weight = '/data5/private/suyusheng/task_selecte/output_finetune/pytorch_model.bin_1314'
model = BertForPreTraining.from_pretrained(pretrained_weights, output_hidden_states=True,return_dict=True)
model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
model.to(device)


#out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS.pt")
out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS_res.pt")
out_CLS = out_CLS.to(device)

#with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain.json") as f:
with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_res.json") as f:
    out_data = json.load(f)

with open("../data/restaurant/train.json") as f:
    data = json.load(f)
    for index, d in enumerate(tqdm(data)):
Example #11
class NERTagger(pl.LightningModule):
    def __init__(self, hparams):
        """
        input:
            hparams: namespace with the following items:
                'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
                'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo', help='BioELMo Directory')
                'max_length' (int): Max Length. default: 1024
                'lr' (float): Learning Rate. default: 1e-2
                'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
                'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
        """
        super().__init__()
        self.hparams = hparams
        self.itol = ID_TO_LABEL
        self.ltoi = {v: k for k, v in self.itol.items()}

        if self.hparams.model == "bioelmo":
            # Load Pretrained BioELMo
            DIR_ELMo = pathlib.Path(str(self.hparams.bioelmo_dir))
            self.bioelmo = self.load_bioelmo(
                DIR_ELMo, not self.hparams.fine_tune_bioelmo
            )
            self.bioelmo_output_dim = self.bioelmo.get_output_dim()

            # ELMo padding token (in ELMo, the token with ID 0 is used for padding)
            VOCAB_FILE_PATH = DIR_ELMo / "vocab.txt"
            command = shlex.split(f"head -n 1 {VOCAB_FILE_PATH}")
            res = subprocess.Popen(command, stdout=subprocess.PIPE)
            self.bioelmo_pad_token = res.communicate()[0].decode("utf-8").strip()

            # Initialize Intermediate Affine Layer
            self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim), len(self.itol))

        elif self.hparams.model == "biobert":
            # Load Pretrained BioBERT
            PATH_BioBERT = pathlib.Path(str(self.hparams.biobert_path))
            self.bertconfig = BertConfig.from_pretrained(self.hparams.bert_model_type)
            self.bertforpretraining = BertForPreTraining(self.bertconfig)
            self.bertforpretraining.load_tf_weights(self.bertconfig, PATH_BioBERT)
            self.biobert = self.bertforpretraining.bert
            self.tokenizer = BertTokenizer.from_pretrained(self.hparams.bert_model_type)

            # Freeze BioBERT if fine-tune not desired
            if not self.hparams.fine_tune_biobert:
                for n, m in self.biobert.named_parameters():
                    m.requires_grad = False

            # Initialize Intermediate Affine Layer
            self.hidden_to_tag = nn.Linear(
                int(self.bertconfig.hidden_size), len(self.itol)
            )

        # Initialize CRF
        TRANSITIONS = conditional_random_field.allowed_transitions(
            constraint_type="BIO", labels=self.itol
        )
        self.crf = conditional_random_field.ConditionalRandomField(
            # set to 3 because here "tags" means ['O', 'B', 'I']
            # no need to include 'BOS' and 'EOS' in "tags"
            num_tags=len(self.itol),
            constraints=TRANSITIONS,
            include_start_end_transitions=False,
        )
        self.crf.reset_parameters()

    @staticmethod
    def load_bioelmo(bioelmo_dir: str, freeze: bool) -> Elmo:
        # Load Pretrained BioELMo
        DIR_ELMo = pathlib.Path(bioelmo_dir)
        bioelmo = Elmo(
            DIR_ELMo / "biomed_elmo_options.json",
            DIR_ELMo / "biomed_elmo_weights.hdf5",
            1,
            requires_grad=bool(not freeze),
            dropout=0,
        )
        return bioelmo

    def get_device(self):
        return self.crf.state_dict()["transitions"].device

    def _forward_bioelmo(self, tokens) -> Tuple[torch.Tensor, torch.Tensor]:
        # character_ids: torch.tensor(n_batch, len_max)
        # documents will be padded to have the same token lengths as the longest document
        character_ids = batch_to_ids(tokens)
        character_ids = character_ids[:, : self.hparams.max_length, :]
        character_ids = character_ids.to(self.get_device())

        # character_ids -> BioELMo hidden state of the last layer & mask
        out = self.bioelmo(character_ids)
        hidden = out["elmo_representations"][-1]
        crf_mask = out["mask"].to(torch.bool).to(self.get_device())

        return (hidden, crf_mask)

    def _forward_biobert(
        self, tokens: List[List[str]]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return BioBERT Hidden state for the tokenized documents.
        Documents with different lengths will be accepted.

        list(list(str)) -> tuple(torch.tensor, torch.tensor)
        """
        # Convert each token of each document into a list of subwords.
        # e.g.,
        #   [['Admission', 'Date', ...], ['Service', ':', ...]]
        #       |
        #       V
        #   [[['Ad', '##mission'], ['Date'], ...], [['Service'], [':'], ...]]
        subwords_unchained = [
            [self.tokenizer.tokenize(tok) for tok in doc] for doc in tokens
        ]

        # Simply replace each token of each document with corresponding subwords.
        # e.g.,
        #   [['Admission', 'Date', ...], ['Service', ':', ...]]
        #       |
        #       V
        #   [['Ad', '##mission', 'Date', ...], ['Service', ':', ...]]
        subwords = [
            list(itertools.chain(*[self.tokenizer.tokenize(tok) for tok in doc]))
            for doc in tokens
        ]

        # Memorize (i) the head position of each token and (ii) how many subwords each token produced.
        # e.g.,
        #   For document ['Admission', 'Date'] -> ['Ad', '##mission', 'Date'],
        #   subword_info will be {'start':[0,2], 'length':[2,1]}.
        subword_info = []
        for doc in subwords_unchained:
            word_lengths = [len(word) for word in doc]
            word_head_ix = [0]
            for i in range(len(word_lengths) - 1):
                word_head_ix.append(word_head_ix[-1] + word_lengths[i])
            assert len(word_lengths) == len(word_head_ix)
            subword_info.append({"start": word_head_ix, "length": word_lengths})

        assert [len(info["start"]) for info in subword_info] == [
            len(doc) for doc in tokens
        ]

        # Split each document into chunks shorter than max_length.
        # Here, each document is simply split every (max_length - 2) subwords.

        max_length = min(
            self.bertconfig.max_position_embeddings, self.hparams.max_length
        )

        longest_length = max([len(doc) for doc in subwords])
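        # Ceiling division: each chunk holds at most (max_length - 2) subwords,
        # reserving two positions for the [CLS] and [SEP] special tokens.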
        n_chunks = (longest_length - 1) // (max_length - 2) + 1
        chunks = []
        for n in range(n_chunks):
            chunk_of_all_documents = []
            for document in subwords:
                chunk_of_single_document = document[
                    (max_length - 2) * n : (max_length - 2) * (n + 1)
                ]
                if chunk_of_single_document == []:
                    chunk_of_all_documents.append([""])
                else:
                    chunk_of_all_documents.append(chunk_of_single_document)
            chunks.append(chunk_of_all_documents)

        # Convert chunks into BERT input form.
        inputs = []
        for chunk in chunks:
            if type(chunk) is str:
                unsqueezed_chunk = [[chunk]]
            elif type(chunk) is list:
                if type(chunk[0]) is str:
                    unsqueezed_chunk = [chunk]
                elif type(chunk[0]) is list:
                    unsqueezed_chunk = chunk

            inputs.append(
                self.tokenizer.batch_encode_plus(
                    unsqueezed_chunk,
                    pad_to_max_length=True,
                    is_pretokenized=True,
                )
            )

        # Get BioBERT hidden states.
        hidden_states = []
        for inpt in inputs:
            inpt_tensors = {
                k: torch.tensor(v).to(self.get_device()) for k, v in inpt.items()
            }
            hidden_state = self.biobert(**inpt_tensors)[0][:, 1:-1, :]
            hidden_states.append(hidden_state)

        # Concatenate hidden states from each chunk.
        hidden_states_cat = torch.cat(hidden_states, dim=1)

        # If a word was tokenized into multiple subwords, take average of them.
        # e.g. Hidden state for "Admission" equals average of hidden states for "Ad" and "##mission"
        hidden_states_shrunk = torch.zeros_like(hidden_states_cat)
        for n in range(hidden_states_cat.size()[0]):
            hidden_state_shrunk = torch.stack(
                [
                    torch.narrow(hidden_states_cat[n], dim=0, start=s, length=l).mean(
                        dim=0
                    )
                    for s, l in zip(subword_info[n]["start"], subword_info[n]["length"])
                ]
            )
            hidden_states_shrunk[
                n, : hidden_state_shrunk.size()[0], :
            ] = hidden_state_shrunk

        # Truncate lengthy tail that will not be used.
        hidden_states_shrunk = hidden_states_shrunk[
            :, : max([len(doc) for doc in tokens]), :
        ]

        # Create mask for CRF.
        crf_mask = torch.zeros(hidden_states_shrunk.size()[:2]).to(torch.uint8)
        for i, length in enumerate([len(doc) for doc in tokens]):
            crf_mask[i, :length] = 1
        crf_mask = crf_mask > 0
        crf_mask = crf_mask.to(self.get_device())

        return (hidden_states_shrunk, crf_mask)

    def _forward_crf(
        self,
        hidden: torch.Tensor,
        gold_tags_padded: torch.Tensor,
        crf_mask: torch.Tensor,
    ) -> Dict:
        """
        input:
            hidden (torch.tensor) (n_batch, seq_length, hidden_dim)
            gold_tags_padded (torch.tensor) (n_batch, seq_length)
            crf_mask (torch.bool) (n_batch, seq_length)
        output:
            result (dict)
                'log_likelihood' : torch.tensor
                'pred_tags_packed' : torch.nn.utils.rnn.PackedSequence
                'gold_tags_padded' : torch.tensor
        """
        result = {}

        if gold_tags_padded is not None and not (
            hidden.size()[1] == gold_tags_padded.size()[1] == crf_mask.size()[1]
        ):
            raise RuntimeError(
                "seq_length of hidden, gold_tags_padded, and crf_mask do not match: "
                + f"{hidden.size()}, {gold_tags_padded.size()}, {crf_mask.size()}"
            )

        if gold_tags_padded is not None:
            # Training Mode
            # Log likelihood
            log_prob = self.crf.forward(hidden, gold_tags_padded, crf_mask)

            # top k=1 tagging
            Y = [
                torch.tensor(result[0])
                for result in self.crf.viterbi_tags(logits=hidden, mask=crf_mask)
            ]
            Y = rnn.pack_sequence(Y, enforce_sorted=False)

            result["log_likelihood"] = log_prob
            result["pred_tags_packed"] = Y
            result["gold_tags_padded"] = gold_tags_padded
            return result

        else:
            # Prediction Mode
            # top k=1 tagging
            Y = [
                torch.tensor(result[0])
                for result in self.crf.viterbi_tags(logits=hidden, mask=crf_mask)
            ]
            Y = rnn.pack_sequence(Y, enforce_sorted=False)
            result["pred_tags_packed"] = Y
            return result

    def forward(self, tokens, gold_tags=None):
        """
        Main NER tagging function.
        Documents with different token lengths are accepted.

        input:
            tokens (list(list(str))): List of documents for the batch. Each document must be stored as a list of tokens.
            gold_tags (list(list(int))): List of gold labels for each document of the batch.
        output:
            result (dict)
                'log_likelihood' : torch.tensor
                'pred_tags_packed' : torch.nn.utils.rnn.PackedSequence
                'gold_tags_padded' : torch.tensor
        """
        if self.hparams.model == "bioelmo":
            # BioELMo features
            hidden, crf_mask = self._forward_bioelmo(tokens)

        elif self.hparams.model == "biobert":
            # BioBERT features
            hidden, crf_mask = self._forward_biobert(tokens)

        # Turn on gradient tracking
        # Affine transformation (Hidden_dim -> N_tag)
        hidden.requires_grad_()
        hidden = self.hidden_to_tag(hidden)

        if gold_tags is not None:
            gold_tags = [torch.tensor(seq) for seq in gold_tags]
            gold_tags_padded = rnn.pad_sequence(
                gold_tags, batch_first=True, padding_value=self.ltoi["O"]
            )
            gold_tags_padded = gold_tags_padded[:, : self.hparams.max_length]
            gold_tags_padded = gold_tags_padded.to(self.get_device())
        else:
            gold_tags_padded = None

        result = self._forward_crf(hidden, gold_tags_padded, crf_mask)
        return result

    def recognize_named_entity(self, token, gold_tags=None):
        """
        Alias of self.forward().
        """
        return self.forward(token, gold_tags)

    def step(self, batch, batch_nb, *optimizer_idx):
        tokens_nopad = batch["tokens"]
        tags_nopad = batch["tags"]

        assert list(map(len, tokens_nopad)) == list(
            map(len, tags_nopad)
        ), "ERROR: the number of tokens and BIO tags are different in some record."

        # Negative Log Likelihood
        result = self.forward(tokens_nopad, tags_nopad)
        returns = {
            "loss": result["log_likelihood"] * (-1.0),
            "T": result["gold_tags_padded"],
            "Y": result["pred_tags_packed"],
            "I": batch["ix"],
        }

        assert (
            torch.isnan(returns["loss"]).sum().item() == 0
        ), "Loss function contains nan."
        return returns

    def unpack_pred_tags(self, Y_packed):
        """
        input:
            Y_packed: torch.nn.utils.rnn.PackedSequence
        output:
            Y: list(list(str))
                Predicted NER tagging sequence.
        """
        Y_padded, Y_len = rnn.pad_packed_sequence(
            Y_packed, batch_first=True, padding_value=-1
        )
        Y_padded = Y_padded.numpy().tolist()
        Y_len = Y_len.numpy().tolist()

        # Replace B- tag with I- tag
        # because the original paper defines the NER task as identification of spans, not entities
        Y = [
            [self.itol[ix].replace("B-", "I-") for ix in ids[:length]]
            for ids, length in zip(Y_padded, Y_len)
        ]

        return Y

    def unpack_gold_and_pred_tags(self, T_padded, Y_packed):
        """
        input:
            T_padded: torch.tensor
            Y_packed: torch.nn.utils.rnn.PackedSequence
        output:
            T: list(list(str))
                Gold NER tagging sequence.
            Y: list(list(str))
                Predicted NER tagging sequence.
        """
        Y = self.unpack_pred_tags(Y_packed)
        Y_len = [len(seq) for seq in Y]

        T_padded = T_padded.numpy().tolist()

        # Replace B- tag with I- tag
        # because the original paper defines the NER task as identification of spans, not entities
        T = [
            [self.itol[ix] for ix in ids[:length]]
            for ids, length in zip(T_padded, Y_len)
        ]

        return T, Y

    def gather_outputs(self, outputs):
        if len(outputs) > 1:
            loss = torch.mean(torch.tensor([output["loss"] for output in outputs]))
        else:
            loss = outputs[0]["loss"]

        IX = []
        Y = []
        T = []

        for output in outputs:
            T_batch, Y_batch = self.unpack_gold_and_pred_tags(
                output["T"].cpu(), output["Y"].cpu()
            )
            T += T_batch
            Y += Y_batch
            IX += output["I"].cpu().numpy().tolist()

        returns = {"loss": loss, "T": T, "Y": Y, "I": IX}

        return returns

    def training_step(self, batch, batch_nb, *optimizer_idx) -> Dict:
        # Process on individual mini-batches
        """
        (batch) -> (dict or OrderedDict)
        # Caution: key for loss function must exactly be 'loss'.
        """
        return self.step(batch, batch_nb, *optimizer_idx)

    def training_epoch_end(self, outputs: Union[List[Dict], List[List[Dict]]]) -> Dict:
        """
        outputs(list of dict) -> loss(dict or OrderedDict)
        # Caution: key must exactly be 'loss'.
        """
        outs = self.gather_outputs(outputs)
        loss = outs["loss"]
        Y = outs["Y"]
        T = outs["T"]

        get_logger(self.hparams.version).info(
            f"========== Training Epoch {self.current_epoch} =========="
        )
        get_logger(self.hparams.version).info(f"Loss: {loss.item()}")
        get_logger(self.hparams.version).info(
            f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}"
        )

        progress_bar = {"train_loss": loss}
        returns = {"loss": loss, "progress_bar": progress_bar}
        return returns

    def validation_step(self, batch, batch_nb) -> Dict:
        # Process on individual mini-batches
        """
        (batch) -> (dict or OrderedDict)
        """
        return self.step(batch, batch_nb)

    def validation_epoch_end(
        self, outputs: Union[List[Dict], List[List[Dict]]]
    ) -> Dict:
        """
        For single dataloader:
            outputs(list of dict) -> (dict or OrderedDict)
        For multiple dataloaders:
            outputs(list of (list of dict)) -> (dict or OrderedDict)
        """
        outs = self.gather_outputs(outputs)
        loss = outs["loss"]
        Y = outs["Y"]
        T = outs["T"]

        get_logger(self.hparams.version).info(
            f"========== Validation Epoch {self.current_epoch} =========="
        )
        get_logger(self.hparams.version).info(f"Loss: {loss.item()}")
        get_logger(self.hparams.version).info(
            f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}"
        )

        progress_bar = {"val_loss": loss}
        returns = {"val_loss": loss, "progress_bar": progress_bar}
        return returns

    def test_step(self, batch, batch_nb) -> Dict:
        # Process on individual mini-batches
        """
        (batch) -> (dict or OrderedDict)
        """
        return self.step(batch, batch_nb)

    def test_epoch_end(self, outputs: Union[List[Dict], List[List[Dict]]]) -> Dict:
        """
        For single dataloader:
            outputs(list of dict) -> (dict or OrderedDict)
        For multiple dataloaders:
            outputs(list of (list of dict)) -> (dict or OrderedDict)
        """
        outs = self.gather_outputs(outputs)
        loss = outs["loss"]
        Y = outs["Y"]
        T = outs["T"]

        get_logger(self.hparams.version).info("========== Test ==========")
        get_logger(self.hparams.version).info(f"Loss: {loss.item()}")
        get_logger(self.hparams.version).info(
            f"Entity-wise classification report\n{seq_classification_report(T, Y, 4)}"
        )

        progress_bar = {"test_loss": loss}
        returns = {"test_loss": loss, "progress_bar": progress_bar}
        return returns

    def configure_optimizers(
        self,
    ) -> Union[torch.optim.Optimizer, List[torch.optim.Optimizer]]:
        if self.hparams.model == "bioelmo":
            if self.hparams.fine_tune_bioelmo:
                optimizer_bioelmo_1 = optim.Adam(
                    self.bioelmo.parameters(), lr=float(self.hparams.lr_bioelmo)
                )
                optimizer_bioelmo_2 = optim.Adam(
                    self.hidden_to_tag.parameters(), lr=float(self.hparams.lr_bioelmo)
                )
                optimizer_crf = optim.Adam(
                    self.crf.parameters(), lr=float(self.hparams.lr)
                )
                return [optimizer_bioelmo_1, optimizer_bioelmo_2, optimizer_crf]
            else:
                optimizer = optim.Adam(self.parameters(), lr=float(self.hparams.lr))
                return optimizer

        elif self.hparams.model == "biobert":
            if self.hparams.fine_tune_biobert:
                optimizer_biobert_1 = optim.Adam(
                    self.biobert.parameters(), lr=float(self.hparams.lr_biobert)
                )
                optimizer_biobert_2 = optim.Adam(
                    self.hidden_to_tag.parameters(), lr=float(self.hparams.lr_biobert)
                )
                optimizer_crf = optim.Adam(
                    self.crf.parameters(), lr=float(self.hparams.lr)
                )
                return [optimizer_biobert_1, optimizer_biobert_2, optimizer_crf]
            else:
                optimizer = optim.Adam(self.parameters(), lr=float(self.hparams.lr))
                return optimizer

    def train_dataloader(self) -> torch.utils.data.DataLoader:
        ds_train = NERDataset.from_dirnames(self.hparams.train_dirs)
        dl_train = NERDataLoader(
            ds_train, batch_size=self.hparams.batch_size, shuffle=True
        )
        return dl_train

    def val_dataloader(self) -> torch.utils.data.DataLoader:
        ds_val = NERDataset.from_dirnames(self.hparams.val_dirs)
        dl_val = NERDataLoader(
            ds_val, batch_size=self.hparams.batch_size, shuffle=False
        )
        return dl_val

    def test_dataloader(self) -> torch.utils.data.DataLoader:
        ds_test = NERDataset.from_dirnames(self.hparams.test_dirs)
        dl_test = NERDataLoader(
            ds_test, batch_size=self.hparams.batch_size, shuffle=False
        )
        return dl_test
Example #12
def prepare_model(args, device):
    config = BertConfig.from_pretrained('bert-base-uncased',
                                        cache_dir=args.cache_dir)

    # config.num_hidden_layers = 12
    if args.force_num_hidden_layers:
        logger.info("Modifying model config with num_hidden_layers to %d",
                    args.force_num_hidden_layers)
        config.num_hidden_layers = args.force_num_hidden_layers

    model = BertForPreTraining(config)
    if args.init_state_dict is not None:
        model.load_state_dict(args.init_state_dict, strict=False)
    model_desc = bert_model_description(config)

    lr_scheduler = LinearWarmupLRScheduler(total_steps=int(args.max_steps),
                                           warmup=args.warmup_proportion)

    loss_scaler = amp.DynamicLossScaler() if args.fp16 else None

    options = orttrainer.ORTTrainerOptions({
        'batch': {
            'gradient_accumulation_steps': args.gradient_accumulation_steps
        },
        'device': {
            'id': str(device)
        },
        'mixed_precision': {
            'enabled': args.fp16,
            'loss_scaler': loss_scaler
        },
        'debug': {
            'deterministic_compute': True,
        },
        'utils': {
            'grad_norm_clip': True
        },
        'distributed': {
            'world_rank': max(0, args.local_rank),
            'world_size': args.world_size,
            'local_rank': max(0, args.local_rank),
            'allreduce_post_accumulation': args.allreduce_post_accumulation,
            'deepspeed_zero_optimization': {
                'stage': args.deepspeed_zero_stage
            }
        },
        'lr_scheduler': lr_scheduler
    })

    param_optimizer = list(model.named_parameters())
    no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
    params = [{
        'params': [
            n for n, p in param_optimizer
            if any(no_decay_key in n for no_decay_key in no_decay_keys)
        ],
        "alpha": 0.9,
        "beta": 0.999,
        "lambda": 0.0,
        "epsilon": 1e-6
    }, {
        'params': [
            n for n, p in param_optimizer
            if not any(no_decay_key in n for no_decay_key in no_decay_keys)
        ],
        "alpha": 0.9,
        "beta": 0.999,
        "lambda": 0.0,
        "epsilon": 1e-6
    }]

    optim_config = optim.AdamConfig(params=params,
                                    lr=2e-5,
                                    do_bias_correction=True)
    model = orttrainer.ORTTrainer(model,
                                  model_desc,
                                  optim_config,
                                  options=options)

    return model
Example #13
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default="manual_description.txt",
                        type=str,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default="out",
        type=str,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=200,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=4.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        default=True,
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(
        args.bert_model, config=BertConfig.from_pretrained(args.bert_model))
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            print("epoch=", epoch)
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", position=0)):

                with torch.no_grad():
                    batch = (item.cuda(device=device) for item in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                model.train()
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    attention_mask=input_mask,
                    token_type_ids=segment_ids)

                if lm_label_ids is not None and is_next is not None:
                    loss_fct = CrossEntropyLoss(ignore_index=-1)
                    #masked_lm_loss = loss_fct(prediction_scores.view(-1, model.config.vocab_size),lm_label_ids.view(-1))
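                    # Only the next-sentence-prediction loss is used here; the MLM term above is commented out.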
                    next_sentence_loss = loss_fct(
                        seq_relationship_score.view(-1, 2), is_next.view(-1))
                    total_loss = next_sentence_loss

                loss = total_loss
                if step % 200 == 0:
                    print(loss)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            if epoch % 5 == 0:
                # Save a trained model
                logger.info("** ** * Saving fine - tuned model ** ** * ")
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                checkpoint_prefix = 'checkpoint' + str(epoch)
                output_dir = os.path.join(
                    args.output_dir, '{}-{}'.format(checkpoint_prefix,
                                                    global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                if args.do_train:
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
Example #14
import torch
from transformers import BertTokenizer, BertForPreTraining

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

# Generate pseudo-error (noise) sentences and store them in the img2info dictionary.
import json
import random
import pickle
import nltk
# word_tokenize
nltk.download('punkt')
# pos_tag
nltk.download('averaged_perceptron_tagger')
# wordnet
from nltk.corpus import wordnet as wn
from tqdm import tqdm


def build_img2info(json_obj, sim_value):
    # Key: image id; value: (key, caption, noise caption)
    img2info = {}
    idx = 0
    for dic in tqdm(json_obj.values(), total=len(json_obj)):
        new_noise = []
        for caption in dic['captions']:
            noise_captions = []
            # Morphological analysis (tokenization)
            morph = nltk.word_tokenize(caption.lower())
            pos = nltk.pos_tag(morph)
Example #15
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.deterministic = True

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model 'bert'")
    model = BertForPreTraining.from_pretrained('bert-base-uncased',
                                               return_dict=True)

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            #  args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
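    # `vocab_size` is assumed to be defined at module scope (30522 for bert-base-uncased).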
    criterion = BertPretrainingCriterion(vocab_size)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    args.max_predictions_per_seq = 80

    # Data loading code
    traindir = os.path.join(args.data)
    epoch = 0
    training_steps = 0

    writer = None
    enable_tensorboard = args.rank <= 0
    if enable_tensorboard:
        if args.rank == -1:
            # No DDP:
            writer = SummaryWriter(comment='_bert_no_ddp_' + args.data)
        else:
            writer = SummaryWriter(comment='_bert_' + args.dist_backend + '_' +
                                   str(args.world_size) + 'GPUs_' + args.data)

    train_raw_start = time.time()
    while True:
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        example_speed = AverageMeter('Speed', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')

        files = [
            os.path.join(traindir, f) for f in os.listdir(traindir)
            if os.path.isfile(os.path.join(traindir, f)) and 'training' in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
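        # Seeding with (seed + epoch) gives every rank the same file order for this epoch.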
        f_start_id = 0
        if torch.distributed.is_initialized() and get_world_size() > num_files:
            remainder = get_world_size() % num_files
            data_file = files[(f_start_id * get_world_size() + get_rank() +
                               remainder * f_start_id) % num_files]
        else:
            data_file = files[(f_start_id * get_world_size() + get_rank()) %
                              num_files]

        previous_file = data_file
        train_data = pretraining_dataset(data_file,
                                         args.max_predictions_per_seq)
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_data, shuffle=False)
        else:
            train_sampler = torch.utils.data.RandomSampler(train_data)

        train_dataloader = torch.utils.data.DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=args.batch_size,
            num_workers=4,
            pin_memory=True)

        pool = ProcessPoolExecutor(1)
        shared_file_list = {}

        for f_id in range(f_start_id + 1, len(files)):
            if get_world_size() > num_files:
                data_file = files[(f_id * get_world_size() + get_rank() +
                                   remainder * f_id) % num_files]
            else:
                data_file = files[(f_id * get_world_size() + get_rank()) %
                                  num_files]

            previous_file = data_file
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq,
                                         shared_file_list, args)
            train_iter = train_dataloader
            end = time.time()
            progress = ProgressMeter(
                len(train_iter),
                [batch_time, data_time, example_speed, losses],
                prefix="Epoch: [{}]".format(epoch))

            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(args.gpu) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                outputs = model(input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask)
                prediction_scores = outputs.prediction_logits
                seq_relationship_score = outputs.seq_relationship_logits
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_labels, next_sentence_labels)
                losses.update(loss.item())

                # compute gradient and do SGD step
                # optimizer.zero_grad()
                loss.backward()
                optimizer.step()
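                # Setting grads to None (rather than zeroing) is equivalent to
                # optimizer.zero_grad(set_to_none=True) and saves a memory write.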
                for param in model.parameters():
                    param.grad = None

                # measure elapsed time
                elapsed_time = time.time() - end
                batch_time.update(elapsed_time)
                end = time.time()
                speed = len(batch[0]) / elapsed_time
                example_speed.update(speed)
                global global_steps
                global global_examples

                global_examples += len(batch[0])
                global_steps += 1

                if step % args.print_freq == 0:
                    progress.display(step)
                    if writer is not None:
                        writer.add_scalar('loss/step', loss.item(),
                                          global_steps)
                        writer.add_scalar('speed/step', speed, global_steps)

                if global_steps >= (args.max_step / abs(args.world_size)):
                    break

            if global_steps >= (args.max_step / abs(args.world_size)):
                break

            del train_dataloader
            train_dataloader, data_file = dataset_future.result(timeout=None)

        now = time.time()
        print('Global Steps: ' + str(global_steps))
        print('Total Examples: ' + str(global_examples))
        print('Train duration: ' + str(now - train_raw_start))
        print('Example/Sec: ' + str(global_examples / (now - train_raw_start)))
        epoch += 1
        if epoch >= args.epochs:
            break

    if writer is not None:
        writer.add_scalar('overall_speed/step',
                          global_examples / (now - train_raw_start),
                          global_steps)
        writer.close()
Example #16
def train():
    logger.info('*' * 64)
    logger.info('token:%s' % current_time)
    logger.info('*' * 64)

    parser = ArgumentParser()
    parser.add_argument(
        "--train_file",
        type=str,
        default="./my_test/data/student/part1.txt",
        help="Path or url of the dataset. If empty download from S3.")

    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./cache/',
                        help="Path or url of the dataset cache")
    parser.add_argument("--batch_size",
                        type=int,
                        default=2,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-4,
                        help="Learning rate")
    # parser.add_argument("--train_precent", type=float, default=0.7, help="Batch size for validation")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=1,
                        help="Number of training epochs")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    # parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--log_step",
                        type=int,
                        default=1,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--base_model", type=str, default="bert-base-uncased")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )

    args = parser.parse_args()
    logger.info(args)
    device = torch.device(args.device)
    tokenizer = BertTokenizer.from_pretrained(args.base_model)

    train_dataset = BERTDataset(args.train_file,
                                tokenizer,
                                seq_len=args.max_seq_length,
                                corpus_lines=None,
                                on_memory=args.on_memory)
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size)

    model = BertForPreTraining.from_pretrained(args.base_model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    steps = len(train_data_loader.dataset) // train_data_loader.batch_size
    steps = steps if steps > 0 else 1
    logger.info('steps:%d' % steps)

    lr_warmup = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=1500,
                                                num_training_steps=steps *
                                                args.n_epochs)

    multi_gpu = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        gpu_list = list(range(torch.cuda.device_count()))
        model = DataParallel(model, device_ids=gpu_list)
        multi_gpu = True

    if torch.cuda.is_available():
        model.cuda()

    # model.to(device)
    # criterion.to(device)

    def update(engine, batch):
        model.train()
        # batch: input_ids, input_mask, segment_ids, lm_label_ids, is_next
        batch = tuple(t.to(device) for t in batch)

        # BertForPreTraining.forward also accepts position_ids, head_mask and
        # inputs_embeds; they are left at their defaults here.
        outputs = model(input_ids=batch[0],
                        attention_mask=batch[1],
                        token_type_ids=batch[2],
                        masked_lm_labels=batch[3],
                        next_sentence_label=batch[4])
        # With both label arguments given, the first element of the returned tuple
        # is the combined masked-LM + next-sentence loss.
        loss = outputs[0]

        if multi_gpu:
            loss = loss.mean()
        loss = loss / args.gradient_accumulation_steps
        loss.backward()

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        # The scheduler's num_training_steps is defined in iterations, so it is
        # advanced once per iteration.
        lr_warmup.step()

        return loss.cpu().item()

    trainer = Engine(update)

    # def inference(engine, batch):
    #     model.eval()
    #     with torch.no_grad():
    #         input_ids = batch[0].to(device)
    #         attention_mask = batch[1].to(device)
    #         labels = batch[2].to(device)
    #         output = model(input_ids=input_ids, attention_mask=attention_mask)
    #
    #         predict = output.permute(1, 2, 0)
    #         trg = labels.permute(1, 0)
    #         loss = criterion(predict.to(device), trg.to(device))
    # return predict, trg
    #
    # evaluator = Engine(inference)
    # metrics = {"nll": Loss(criterion, output_transform=lambda x: (x[0], x[1])),
    #            "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))}
    # for name, metric in metrics.items():
    #     metric.attach(evaluator, name)
    #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_validation_results(trainer):
    #     evaluator.run(valid_data_loader)
    #     ms = evaluator.state.metrics
    #     logger.info("Validation Results - Epoch: [{}/{}]  Avg accuracy: {:.6f} Avg loss: {:.6f}"
    #           .format(trainer.state.epoch, trainer.state.max_epochs, ms['accuracy'], ms['nll']))

    #
    '''======================early stopping =========================='''
    # def score_function(engine):
    #     val_loss = engine.state.metrics['nll']
    #     return -val_loss
    # handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
    # evaluator.add_event_handler(Events.COMPLETED, handler)
    '''==================log training loss per iteration========================='''

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        if trainer.state.iteration % args.log_step == 0:
            logger.info("Epoch[{}/{}] Step[{}/{}] Loss: {:.6f}".format(
                trainer.state.epoch, trainer.state.max_epochs,
                trainer.state.iteration % steps, steps,
                trainer.state.output * args.gradient_accumulation_steps))

    '''================add checkpoint handler========================'''
    checkpoint_handler = ModelCheckpoint(checkpoint_dir,
                                         'checkpoint',
                                         n_saved=3)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'BertForPreTraining': getattr(model, 'module', model)
         })  # "getattr" takes care of the DataParallel wrapper
    '''==============run trainer============================='''
    trainer.run(train_data_loader, max_epochs=args.n_epochs)
Beispiel #17
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):

    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_checkpoint_path, bert_config_file))

    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_checkpoint_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_checkpoint_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


# if __name__=='__main__':
#     convert_tf_checkpoint_to_pytorch(config.TF_PATH,config.BERT_CONFIG_FILE,config.BERT_WEIGHTS)
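For orientation, a sketch (assuming a standard BERT checkpoint) of how the name-splitting loop above resolves one TF variable onto the PyTorch model:

# TF variable:      "bert/encoder/layer_0/attention/self/query/kernel"
# PyTorch pointer:   model.bert.encoder.layer[0].attention.self.query.weight
# The array is additionally transposed because the final scope name is "kernel".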
Beispiel #18
0
def convert_pytorch_checkpoint_to_tf(model: BertForPreTraining, ckpt_dir: str,
                                     model_name: str):
    """
    Args:
        model: BertModel Pytorch model instance to be converted
        ckpt_dir: Tensorflow model directory
        model_name: model name
    Currently supported HF models:
        - Y BertModel
        - N BertForMaskedLM
        - N BertForPreTraining
        - N BertForMultipleChoice
        - N BertForNextSentencePrediction
        - N BertForSequenceClassification
        - N BertForQuestionAnswering
    """

    tensors_to_transpose = ("dense.weight", "attention.self.query",
                            "attention.self.key", "attention.self.value")

    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        ("cls.predictions.bias", "cls.predictions.output_bias"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
        ("cls/seq_relationship/bias", "cls/seq_relationship/output_bias"),
        ("cls/seq_relationship/kernel", "cls/seq_relationship/output_weights"),
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return "bert/{}".format(name) if not name.startswith("cls") else name

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype,
                                 shape=tensor.shape,
                                 name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor,
                                   name=tf_name,
                                   session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(
                tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(
            session,
            os.path.join(ckpt_dir,
                         model_name.replace("-", "_") + ".ckpt"))
Beispiel #19
0
    def __init__(self, metadata, timer, is_ZH, data_manager):
        super().__init__()
        self.timer = timer
        self.timer("bert-init")
        self.batch_per_train = 50
        self.batch_size_eval = 64
        self.max_seq_len = 301
        self.batch_size = 48
        self.weight_decay = 0
        self.learning_rate = 5e-5
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.
        self.total_epoch = 100
        self.logging_step = -1
        self.warmup_steps = 0
        self.metadata = metadata
        self.num_class = self.metadata.get_output_size()

        self.bert_folder = extract_bert_model()

        bertConfig = BertConfig.from_json_file(self.bert_folder +
                                               '/config.json')
        self.model = BertClassification(None, bertConfig, self.num_class)

        self.bertTokenizer = BertTokenizer.from_pretrained(self.bert_folder)
        bertModel = BertForPreTraining.from_pretrained(
            self.bert_folder, num_labels=self.num_class, from_tf=BERT_V == 0)
        self.model.bert = bertModel.bert
        del bertModel.cls
        self.model.to(torch.device("cuda"))
        self.data = data_manager
        self.data.add_pipeline(
            BertPipeline(is_ZH,
                         metadata,
                         self.bertTokenizer,
                         max_length=self.max_seq_len))
        self.train_data_loader = None
        self.test_data_loader = None
        self.valid_data_loader = None
        self.done_training = False
        self.estimate_time_per_batch = None
        self.estimate_valid_time = None
        self.estimate_test_time = None

        # init optimizer and scheduler
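        # Biases and LayerNorm weights are conventionally excluded from weight decay.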
        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.learning_rate,
                               eps=self.adam_epsilon)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_epoch * self.batch_per_train)

        # first, we only train the classifier
        self.optimizer_only_classifier = optim.Adam(
            self.model.classifier.parameters(), 0.0005)

        self.place = 'cpu'

        self.timer("bert-init")
        print('[bert init] time cost: %.2f' %
              (self.timer.accumulation["bert-init"]))
Beispiel #20
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = BertForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForPreTraining.from_config(config)
    
    if model_args.cls_model_name_or_path:
        cls_config = AutoConfig.from_pretrained(
            model_args.cls_model_name_or_path,
            num_labels=2,
            finetuning_task="cola",
            cache_dir=model_args.cache_dir,
        )
        cls_model = AutoModelForSequenceClassification.from_pretrained(
            model_args.cls_model_name_or_path,
            from_tf=bool(".ckpt" in model_args.cls_model_name_or_path),
            config=cls_config,
            cache_dir=model_args.cache_dir,
        )
        cls_model.resize_token_embeddings(len(tokenizer))
        mask_selector = MaskSelector(cls_model, training_args)

    model.resize_token_embeddings(len(tokenizer))
    

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(data_args, tokenizer=tokenizer, model_args=model_args, cache_dir=model_args.cache_dir) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, model_args=None, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

   
    data_collator = DataCollatorForMixLM(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )
    

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
Beispiel #21
0
        labels = data['bert_label'].to(device).long()
        optim.zero_grad()
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                        next_sentence_label=next_sentence_label)
        loss = outputs['loss']
        losses.append(loss.cpu().detach().numpy())
    loss = np.mean(losses)
    return loss


device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = BertConfig(vocab_size=len(WORDS) + 1)
model = BertForPreTraining.from_pretrained('bert-base-chinese')
model = model.to(device)
# model=nn.DataParallel(model,device_ids=[0,1])
optim = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    pbar = tqdm(train_loader)
    losses = []
    for data_label in pbar:
        data = data_label[0]
        next_sentence_label = data_label[1].to(device).long()

        input_ids = data['input_ids'].to(device).long()
        token_type_ids = data['token_type_ids'].to(device).long()
        attention_mask = data['attention_mask'].to(device).long()
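        # The snippet is truncated here; a plausible continuation of the loop body,
        # mirroring the evaluation code earlier in this example (not part of the
        # original listing), would be:
        #     labels = data['bert_label'].to(device).long()
        #     optim.zero_grad()
        #     outputs = model(input_ids=input_ids,
        #                     token_type_ids=token_type_ids,
        #                     attention_mask=attention_mask,
        #                     labels=labels,
        #                     next_sentence_label=next_sentence_label)
        #     loss = outputs['loss']
        #     loss.backward()
        #     optim.step()
        #     losses.append(loss.cpu().detach().numpy())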
Beispiel #22
0
    def __init__(self, hparams):
        """
        input:
            hparams: namespace with the following items:
                'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
                'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo', help='BioELMo Directory')
                'max_length' (int): Max Length. default: 1024
                'lr' (float): Learning Rate. default: 1e-2
                'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
                'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
        """
        super().__init__()
        self.hparams = hparams
        self.itol = ID_TO_LABEL
        self.ltoi = {v: k for k, v in self.itol.items()}

        if self.hparams.model == "bioelmo":
            # Load Pretrained BioELMo
            DIR_ELMo = pathlib.Path(str(self.hparams.bioelmo_dir))
            self.bioelmo = self.load_bioelmo(
                DIR_ELMo, not self.hparams.fine_tune_bioelmo
            )
            self.bioelmo_output_dim = self.bioelmo.get_output_dim()

            # ELMo padding token: ELMo uses the token with ID 0 (the first entry of
            # the vocabulary file) for padding
            VOCAB_FILE_PATH = DIR_ELMo / "vocab.txt"
            with open(VOCAB_FILE_PATH) as f:
                self.bioelmo_pad_token = f.readline().strip()

            # Initialize Intermediate Affine Layer
            self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim), len(self.itol))

        elif self.hparams.model == "biobert":
            # Load Pretrained BioBERT
            PATH_BioBERT = pathlib.Path(str(self.hparams.biobert_path))
            self.bertconfig = BertConfig.from_pretrained(self.hparams.bert_model_type)
            self.bertforpretraining = BertForPreTraining(self.bertconfig)
            self.bertforpretraining.load_tf_weights(self.bertconfig, PATH_BioBERT)
            self.biobert = self.bertforpretraining.bert
            self.tokenizer = BertTokenizer.from_pretrained(self.hparams.bert_model_type)

            # Freeze BioBERT if fine-tune not desired
            if not self.hparams.fine_tune_biobert:
                for n, m in self.biobert.named_parameters():
                    m.requires_grad = False

            # Initialize Intermediate Affine Layer
            self.hidden_to_tag = nn.Linear(
                int(self.bertconfig.hidden_size), len(self.itol)
            )

        # Initialize CRF
        TRANSITIONS = conditional_random_field.allowed_transitions(
            constraint_type="BIO", labels=self.itol
        )
        self.crf = conditional_random_field.ConditionalRandomField(
            # set to 3 because here "tags" means ['O', 'B', 'I']
            # no need to include 'BOS' and 'EOS' in "tags"
            num_tags=len(self.itol),
            constraints=TRANSITIONS,
            include_start_end_transitions=False,
        )
        self.crf.reset_parameters()
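
For reference, the constructor above expects hparams roughly like the following namespace; the values are illustrative (defaults taken from the docstring), and biobert_path, bert_model_type, and fine_tune_biobert are additionally required by the biobert branch:

from argparse import Namespace

hparams = Namespace(
    model="bioelmo",  # or "biobert"
    data_dir="./official/ebm_nlp_1_00",
    bioelmo_dir="./models/bioelmo",
    max_length=1024,
    lr=1e-2,
    fine_tune_bioelmo=False,
    lr_bioelmo=1e-4,
    # only used when model == "biobert" (placeholder path/name):
    biobert_path="./models/biobert/biobert_model.ckpt",
    bert_model_type="bert-base-uncased",
    fine_tune_biobert=False,
)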
Beispiel #23
0
    def load_annotations(self, proposal_method, **kwargs):
        logger = logging.getLogger("vmr.trainer")
        logger.info("Preparing data form file {}, please wait...".format(
            self.anno_file))
        self.annos = []
        self.gts = []
        word2vec_cache_prefix = os.path.splitext(self.anno_file)[0]
        word2vec_cache_file = '{}_word2vec_{}.pkl'.format(
            word2vec_cache_prefix, self.word2vec)

        # Define word embedding function
        if os.path.exists(word2vec_cache_file):
            annos_original = None
            # Load word embeddings cache if exists
            logger.info("Word2vec cache exist, load cache file.")
            with open(word2vec_cache_file, 'rb') as F:
                self.annos_query = pickle.load(F)

            def word_embedding(idx, sentence):
                assert self.annos_query[idx]['sentence'] == sentence, \
                    'annotation file {} has been modified, cache file expired!'.format(self.anno_file,)
                return self.annos_query[idx]['query'], self.annos_query[idx][
                    'wordlen']
        else:
            annos_original = []
            # Computing word embeddings if there's no cache
            if self.word2vec == 'BERT':
                # Here we use second-to-last hidden layer
                # See 3.5 Pooling Strategy & Layer Choice in https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings
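                # For bert-base, bert(..., output_hidden_states=True)['hidden_states']
                # holds 13 tensors (embedding output + 12 layer outputs), so index -2
                # selects the output of the second-to-last encoder layer.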
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                bert = BertForPreTraining.from_pretrained('bert-base-uncased',
                                                          return_dict=True)
                bert.to('cuda')

                def word_embedding(idx, sentence):
                    sentence_tokenized = tokenizer(
                        sentence,
                        return_tensors="pt")  # token_num = sentence_num+2
                    for key in sentence_tokenized:
                        sentence_tokenized[key] = sentence_tokenized[key].to(
                            'cuda')
                    with torch.no_grad():
                        query = bert(**sentence_tokenized,
                                     output_hidden_states=True
                                     )['hidden_states'][-2].squeeze_().to(
                                         'cpu')  #(token_num, 768)
                        query = query[1:-1]
                    return query, query.size(
                        0)  #(sentence_len, 768) including punctuations
            elif self.word2vec == 'GloVe':

                def word_embedding(idx, sentence):
                    word2vec = glove_embedding(sentence)
                    return word2vec, word2vec.size(
                        0)  #(sentence_len, 300) including punctuations
            else:
                raise NotImplementedError

        # Loading annotations, generate ground truth for model proposal
        logger.info("loading annotations ...")
        with open(self.anno_file, 'r') as f:
            annos = json.load(f)
        for vid, anno in tqdm(annos.items()):
            duration = anno[
                'duration'] if self.dataset_name != 'tacos' else anno[
                    'num_frames'] / anno['fps']
            # Produce annotations
            for idx in range(len(anno['timestamps'])):
                timestamp = anno['timestamps'][idx]
                sentence = anno['sentences'][idx]
                if timestamp[0] < timestamp[1]:
                    moment = torch.tensor([max(timestamp[0], 0), min(timestamp[1], duration)]) if self.dataset_name != 'tacos' \
                    else torch.tensor(
                            [max(timestamp[0]/anno['fps'],0),
                            min(timestamp[1]/anno['fps'],duration)]
                        )
                    query, wordlen = word_embedding(len(self.annos), sentence)
                    self.avg_wordvec += query.mean(dim=0)
                    if annos_original is not None:
                        annos_original.append({
                            'sentence': sentence,
                            'query': query,
                            'wordlen': wordlen,
                        })
                    adjmat = torch.tensor(anno['dependency_parsing_graph']
                                          [idx]) if self.dep_graph else None
                    if self.consti_mask:
                        constimask = torch.tensor(
                            anno['constituency_parsing_mask'][idx],
                            dtype=torch.float32)
                        layers = torch.linspace(
                            constimask.size(0) - 1, 0, self.tree_depth).long(
                            )  # The original tree is from root to leaf
                        constimask = constimask[layers, :, :]
                    else:
                        constimask = None
                    if self.dep_graph:
                        padding = query.size(0) - adjmat.size(0)
                    adjmat = torch.nn.functional.pad(
                        adjmat,
                        (0, padding, 0,
                         padding), "constant", 0) if self.dep_graph else None
                    if wordlen >= self.max_num_words:
                        wordlen = self.max_num_words
                        query = query[:self.max_num_words]
                        adjmat = adjmat[:self.max_num_words, :self.
                                        max_num_words] if self.dep_graph else None
                    elif self.fix_num_words:
                        padding = self.max_num_words - wordlen
                        query = torch.nn.functional.pad(
                            query, (0, 0, 0, padding), "constant", 0)
                        #print('padded:', query.shape)
                        if self.dep_graph:
                            padding = self.max_num_words - adjmat.size(0)
                        adjmat = torch.nn.functional.pad(
                            adjmat, (0, padding, 0, padding), "constant",
                            0) if self.dep_graph else None

                    self.annos.append({
                        'vid':
                        vid,
                        'moment':
                        moment,
                        'sentence':
                        sentence,
                        'query':
                        query,
                        'querymask':
                        torch.ones(wordlen, dtype=torch.int32),
                        'adjmat':
                        adjmat,
                        'constimask':
                        constimask,
                        'wordlen':
                        wordlen,
                        'duration':
                        duration,
                    })
                    gt_dict = self.__generate_ground_truth__(
                        moment, duration, proposal_method, **kwargs)
                    self.gts.append(gt_dict)

        self.avg_wordvec /= len(self.annos)

        if not os.path.exists(word2vec_cache_file):
            with open(word2vec_cache_file, 'wb') as F:
                word2vec_cache = [{
                    'sentence': anno['sentence'],
                    'query': anno['query'],
                    'wordlen': anno['wordlen']
                } for anno in annos_original]
                pickle.dump(word2vec_cache, F)

        # Loading visual features if in_memory
        if self.in_memory:
            logger.info(
                "Loading visual features from {}, please wait...".format(
                    self.feat_file))
            self.feats, self.seglens = video2feats(self.feat_file,
                                                   annos.keys(),
                                                   self.num_segments,
                                                   self.dataset_name,
                                                   self.upsample)
        logger.info("Dataset prepared!")
Beispiel #24
0
        pretraindata, BATCH_SIZE, collate_fn=pretrain_collate_fn
    )

    # Set the config of the bert
    config = BertConfig(
        num_hidden_layers=4,
        hidden_size=312,
        intermediate_size=1200,
        max_position_embeddings=1024,
    )

    if args.target == "mobert":
        config.num_labels = pretraindata.token_num + 1
        model = MoBert(config)
    elif args.target == "bert":
        model = BertForPreTraining(config)
    model = model.to(device)

    # Pre-train the MoBERT model
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    model.train()

    step = 1
    total_loss = 0
    total_loss_pre = 0
    total_loss_cl = 0
    start = time.time()
    for src, mlm, mask, nsp, mt, token_type_ids in dataloader:
        src = src.to(device)
        mlm = mlm.to(device)
        mask = mask.to(device)
Beispiel #25
0
    def __init__(self,
                 scan_encoder_class=None,
                 scan_encoder_args={},
                 bert_class=None,
                 bert_args={},
                 scan_decoder_class=None,
                 scan_decoder_args={},
                 task_configs=[],
                 vocab_args={},
                 loss_weighting=None,
                 optim_class="Adam",
                 optim_args={},
                 scheduler_class=None,
                 scheduler_args={},
                 pretrained_configs=[],
                 cuda=True,
                 devices=[0]):
        """
        """
        super().__init__(optim_class, optim_args, scheduler_class,
                         scheduler_args, pretrained_configs, cuda, devices)

        self.encodes_scans = scan_encoder_class is not None
        if self.encodes_scans:
            self.scan_encoder = getattr(
                modules, scan_encoder_class)(**scan_encoder_args)
            self.scan_encoder = nn.DataParallel(self.scan_encoder,
                                                device_ids=self.devices)

        if bert_class == "BertModelPreTrained":
            self.bert = BertModel.from_pretrained(**bert_args)
        elif bert_class == "BertForPretraining":
            self.bert = BertForPreTraining.from_pretrained(**bert_args)
        elif bert_class == "BertModel":
            bert_args["config"] = BertConfig.from_dict(bert_args["config"])
            self.bert = BertModel(**bert_args)
        else:
            self.bert = getattr(modules, bert_class)(**bert_args)
        self.bert = nn.DataParallel(self.bert, device_ids=self.devices)

        self.decodes_scans = scan_decoder_class is not None
        if self.decodes_scans:
            self.scan_decoder = getattr(
                modules, scan_decoder_class)(**scan_decoder_args)

        self.task_heads = {}
        self.task_inputs = {}
        for task_head_config in task_configs:
            task = task_head_config["task"]
            head_class = getattr(modules, task_head_config["class"])
            args = task_head_config["args"]
            self.task_inputs[task] = (task_head_config["inputs"] if "inputs"
                                      in task_head_config else "pool")

            if "config" in args:
                # bert task heads take config object for parameters, must convert from dict
                config = args["config"]
                args["config"] = namedtuple("Config",
                                            config.keys())(*config.values())

            if head_class is BertOnlyMLMHead:
                embs = self.bert.module.embeddings.word_embeddings.weight
                self.task_heads[task] = head_class(
                    bert_model_embedding_weights=embs, **args)
            else:
                self.task_heads[task] = head_class(**args)

        self.task_heads = torch.nn.ModuleDict(self.task_heads)

        self.vocab = WordPieceVocab(**vocab_args)

        self._build_loss(loss_weighting)

        self._post_init()
Beispiel #26
0
def main():
    args = get_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # not parallelizing across GPUs because of deadlocks
    n_gpu = 1 if torch.cuda.device_count() > 0 else 0

    logging.info(f'device: {device} n_gpu: {n_gpu} seed: {args.seed}')
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    logging.info(
        f'mem: {res.used / (1024**2)} MiB ({100 * (res.used / res.total):.3f}%)'
    )

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Determine how many epochs of pregenerated data exist on disk; if there are
    # fewer data epochs than training epochs, the available data is reused below.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = total_train_examples // args.train_batch_size

    # Prepare model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        tmp_fp = f'/media/data_1/darius/data/512epoch_{epoch}_dataset_255.pkl'
        if Path(tmp_fp).is_file():
            logging.info(f'Loading dataset from {tmp_fp}...')
            with open(tmp_fp, 'rb') as f:
                epoch_dataset = pickle.load(f)
        else:
            epoch_dataset = PregeneratedDataset(
                epoch=epoch,
                training_path=args.pregenerated_data,
                tokenizer=tokenizer,
                num_data_epochs=num_data_epochs,
                reduce_memory=args.reduce_memory)
            with open(tmp_fp, 'wb') as f:
                pickle.dump(epoch_dataset, f, protocol=4)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for _, (input_ids, input_mask, segment_ids, lm_label_ids,
                    is_next) in enumerate(train_dataloader):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                lm_label_ids = lm_label_ids.to(device)
                is_next = is_next.to(device)
                # breakpoint()
                outputs = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=lm_label_ids,
                                next_sentence_label=is_next)
                # outputs = model(input_ids, segment_ids,
                #                 input_mask, lm_label_ids, is_next)
                loss = outputs.loss
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

    # Save a trained model
    logging.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
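
Since the weights, config, and vocabulary above are all written to args.output_dir under the standard file names, the saved model can later be reloaded from that directory, e.g. (a minimal sketch):

from transformers import BertForPreTraining, BertTokenizer

model = BertForPreTraining.from_pretrained(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.output_dir)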
Beispiel #27
0
DATAPATH = '/datasets/shshi'
pretrained_path = '%s/pretrained' % DATAPATH

if args.model == 'bert_base':
    config = BertConfig.from_json_file('bert_base_config.json')
else:
    config = BertConfig.from_json_file('bert_config.json')
#config = BertConfig.from_json_file('bert_config.json')
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
    config.vocab_size += 8 - (config.vocab_size % 8)

vocab_size = config.vocab_size
#tokenizer = BertTokenizer.from_pretrained(pretrained_path)
#model = BertForPreTraining.from_pretrained(pretrained_path)
model = BertForPreTraining(config)

if args.cuda:
    model.cuda()

#optimizer = AdamW(model.parameters(),
optimizer = optim.Adam(
    model.parameters(),
    lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8  # args.adam_epsilon  - default is 1e-8.
)
#optimizer = optim.SGD(model.parameters(), lr=2e-5)

compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
use_bytescheduler = True
import bytescheduler.pytorch.horovod as bsc
Beispiel #28
0
    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig

    config = BertConfig(vocab_size=32000,
                        hidden_size=256,
                        num_hidden_layers=6,
                        num_attention_heads=4,
                        intermediate_size=3072,
                        hidden_act="gelu",
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        max_position_embeddings=512,
                        type_vocab_size=2,
                        pad_token_id=0,
                        position_embedding_type="absolute")

    model = BertForPreTraining(config=config)
    model.num_parameters()

    train_dataset = TextDatasetForNextSentencePrediction(
        tokenizer=tokenizer,
        file_path='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
        block_size=512,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    )

    # eval_dataset = TextDatasetForNextSentencePrediction(
    #     tokenizer=tokenizer,
    #     file_path='/opt/ml/code/KBOBERT/wiki_20190620_small.txt',
    #     block_size=512,
Beispiel #29
0
from torch.nn import CrossEntropyLoss
from common import AverageMeter
from custom_metrics import LMAccuracy
from data_loader import Data_pretrain
from config import Config

if __name__ == '__main__':
    #  training_path, file_id, tokenizer, data_name, reduce_memory=False
    tokenizer = BertTokenizer.from_pretrained('./bert_base_pretrain/vocab.txt')
    train_data_path = './data/processed_data0.json'
    txt = Data_pretrain(train_data_path, tokenizer)
    data_iter = DataLoader(txt, shuffle=True, batch_size=2)
    bert_config = BertConfig.from_pretrained(Config.config_path)

    # model = BertForPreTraining(config=bert_config)
    model = BertForPreTraining.from_pretrained(
        './bert_base_pretrain/pytorch_model.bin', config=bert_config)
    model.to(Config.device)
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
Beispiel #30
0
        def create_and_check_bert_for_pretraining(self, config, input_ids,
                                                  token_type_ids, input_mask,
                                                  sequence_labels,
                                                  token_labels, choice_labels):
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            onnxruntime.set_seed(seed)

            model = BertForPreTraining(config=config)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels)
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc,
                self.next_sentence_label_desc
            ], [
                self.loss_desc, self.prediction_scores_desc,
                self.seq_relationship_scores_desc
            ])

            from collections import namedtuple
            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )
            args = MyArgs(local_rank=0,
                          world_size=1,
                          max_steps=100,
                          learning_rate=0.00001,
                          warmup_proportion=0.01,
                          batch_size=13,
                          seq_len=7)

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            loss_scaler = LossScaler('loss_scale_input_name',
                                     True,
                                     up_scale_window=2000)

            # It would be better to test both with/without mixed precision and allreduce_post_accumulation.
            # However, a stress test of all 4 cases is not stable, at least on the test machine.
            # Therefore we only test mixed precision with allreduce_post_accumulation, as that is the most useful case.
            option_fp16 = [True]
            option_allreduce_post_accumulation = [True]
            option_gradient_accumulation_steps = [1, 8]
            option_use_internal_get_lr_this_step = [True, False]
            option_use_internal_loss_scaler = [True, False]
            option_split_batch = [BatchArgsOption.ListAndDict]

            for fp16 in option_fp16:
                for allreduce_post_accumulation in option_allreduce_post_accumulation:
                    for gradient_accumulation_steps in option_gradient_accumulation_steps:
                        for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                            for use_internal_loss_scaler in option_use_internal_loss_scaler:
                                for split_batch in option_split_batch:
                                    print("gradient_accumulation_steps:",
                                          gradient_accumulation_steps)
                                    print("use_internal_loss_scaler:",
                                          use_internal_loss_scaler)
                                    loss_ort, prediction_scores_ort, seq_relationship_score_ort =\
                                        run_test(model, model_desc, self.device, args, gradient_accumulation_steps, fp16,
                                                allreduce_post_accumulation,
                                                get_lr_this_step, use_internal_get_lr_this_step,
                                                loss_scaler, use_internal_loss_scaler,
                                                split_batch)

                                    print(loss_ort)
                                    print(prediction_scores_ort)
                                    print(seq_relationship_score_ort)