def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
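For reference, a more compact way to get the same counts (a sketch; it assumes the three models above are already instantiated as in main()):

def count_trainable_parameters(model):
    # Sum the element counts of all parameters that require gradients.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('bert-base-uncased:', count_trainable_parameters(bert_base_model))
print('roberta-base:', count_trainable_parameters(roberta_model))
print('albert-base-v2:', count_trainable_parameters(albert_model))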
Example #2
def get_albert_for_comparison():
    model_name = 'albert-base-v2'
    config = AlbertConfig.from_pretrained(model_name)
    config.output_hidden_states = False

    input_ids = tf.keras.Input(shape=(128, ), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(128, ),
                                    name='attention_mask',
                                    dtype=tf.int32)

    transformer_model = TFAlbertModel.from_pretrained(model_name,
                                                      config=config)
    embedding_layer = transformer_model([input_ids, attention_mask])[0]

    X = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )(embedding_layer[:, 0])
    X = tf.keras.layers.Dropout(config.classifier_dropout_prob)(X)
    output_ = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        name="classifier")(X)

    return tf.keras.Model([input_ids, attention_mask], output_)
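A minimal usage sketch for get_albert_for_comparison(); the zero-filled batch is purely illustrative, real inputs would come from an ALBERT tokenizer:

import numpy as np
import tensorflow as tf

model = get_albert_for_comparison()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))

# Two dummy sequences padded to the fixed length of 128.
dummy_ids = np.zeros((2, 128), dtype=np.int32)
dummy_mask = np.ones((2, 128), dtype=np.int32)
print(model.predict([dummy_ids, dummy_mask]).shape)  # (2, 1)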
Example #3
    def __init__(self, my_config, args):
        super(NqModel, self).__init__()
        #albert_base_configuration = AlbertConfig(vocab_size=30000,hidden_size=768,num_attention_heads=12,intermediate_size=3072,
        #                                        attention_probs_dropout_prob=0)
        self.my_mask = None
        self.args = args
        #mfeb/albert-xxlarge-v2-squad2
        self.bert_config = AlbertConfig.from_pretrained("albert-xxlarge-v2")
        # self.bert_config.gradient_checkpointing = True
        # self.bert_config.Extgradient_checkpointing = True
        self.bert = AlbertModel.from_pretrained("albert-xxlarge-v2",
                                                config=self.bert_config)
        #        self.bert = AlbertModel.from_pretrained("albert-base-v2")
        my_config.hidden_size = self.bert.config.hidden_size

        self.right = 0
        self.all = 0
        #self.bert =  AlbertModel(albert_base_configuration)

        #self.bert2 = BertModel(bert_config)

        #self.bert = BertModel(BertConfig())

        #self.bert =  RobertaModel(RobertaConfig(max_position_embeddings=514,vocab_size=50265))

        #print(my_config,bert_config)
        #        self.tok_dense = nn.Linear(my_config.hidden_size, my_config.hidden_size)
        self.tok_dense = nn.Linear(my_config.hidden_size * 2,
                                   my_config.hidden_size * 2)

        #        self.tok_dense2 = nn.Linear(my_config.hidden_size, my_config.hidden_size)
        #        self.para_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        #        self.doc_dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)

        self.dropout = nn.Dropout(my_config.hidden_dropout_prob)

        self.tok_outputs = nn.Linear(my_config.hidden_size * 2,
                                     1)  # tuned to avoid falling into bad local optima

        #        self.tok_outputs2 = nn.Linear(my_config.hidden_size, 1)
        #        config.max_token_len, config.max_token_relative
        #        self.para_outputs = nn.Linear(self.config.hidden_size, 1)
        #        self.answer_type_outputs = nn.Linear(self.config.hidden_size, 2)

        #        self.tok_to_label = nn.Linear(my_config.max_token_len,2)
        #        self.par_to_label = nn.Linear(my_config.max_paragraph_len,2)

        #self.encoder = Encoder(my_config)
        self.encoder = Encoder(my_config)
        #        self.encoder2 = Encoder(my_config)

        self.my_config = my_config

        self.model_choice = None
        self.ground_answer = None

        self.ACC = 0
        self.ALL = 0

        self.ErrId = []
Example #4
    def __init__(self, config):
        super(AlBert, self).__init__()
        model_config = AlbertConfig.from_pretrained(
            config.config_file,
            num_labels=config.num_labels,
            finetuning_task=config.task,
        )
        self.albert = AlbertModel.from_pretrained(
            config.model_name_or_path,
            config=model_config,
        )
        if config.requires_grad:
            for param in self.albert.parameters():
                param.requires_grad = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # add the weighted layer
        self.hidden_weight = config.weighted_layer_tag  # must be set in config.json
        self.pooling_tag = config.pooling_tag

        if self.hidden_weight:
            self.weight_layer = config.weighted_layer_num
            #self.weight = torch.zeros(self.weight_layer).to(config.device)
            self.weight = torch.nn.Parameter(torch.FloatTensor(self.weight_layer), requires_grad=True)
            self.softmax = nn.Softmax()
            self.pooler = nn.Sequential(nn.Linear(768, 768), nn.Tanh())

        elif self.pooling_tag:
            self.maxPooling = nn.MaxPool1d(64)
            self.avgPooling = nn.AvgPool1d(64)
            self.pooler = nn.Sequential(nn.Linear(768*3, 768), nn.Tanh())
Example #5
    def __init__(self, model_name, model_type):
        """
        Hyper-parameters found with validation set:
        xlnet-large-cased : epoch = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epoch = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2 : epoch = 3, learning_rate = 5e-5, batch_size = 8, epsilon = 1e-6 (to be improved...)
        """
        self.model_name = model_name
        self.model_type = model_type

        # Per the transformers library, a batch size of 16 or 32 is advised for training. Due to memory constraints, we take 16.
        # Gradient accumulation has not led to great improvement and therefore won't be used here.
        if model_type == 'albert':
            self.batch_size = 8
        else:
            self.batch_size = 16

        available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
        available_model_type = ["bert", "xlnet", "albert"]

        if self.model_name not in available_model_name:
            raise Exception("Error : model_name should be in", available_model_name)
        if self.model_type not in available_model_type:
            raise Exception("Error : model_name should be in", available_model_type)

        # Load BertForSequenceClassification: the pretrained BERT model with a single linear regression layer on top of the pooled output.
        # To load a fine-tuned model instead, e.g.: BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
        if self.model_type == 'bert':
            self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for regression task
            self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'xlnet':
            self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'albert':
            self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        self.model.cuda()

        if self.model_name == 'xlnet-large-cased':
            self.epochs = 4
            self.lr = 1e-5
            self.eps = 1e-6

        elif self.model_name == 'bert-large-uncased':
            self.epochs = 4
            self.lr = 3e-5
            self.eps = 1e-8

        elif self.model_name == 'albert-xxlarge-v2':
            self.epochs = 3
            self.lr = 5e-5
            self.eps = 1e-6

        self.max_grad_norm = 1.0  # Gradient threshold, gradients norms that exceed this threshold are scaled down to match the norm.

        self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)
Example #6
 def load_model(self, model_path: str, do_lower_case=True):
     config = AlbertConfig.from_pretrained(model_path + "/config.json")
     tokenizer = AlbertTokenizer.from_pretrained(model_path)
     #tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2', do_lower_case=do_lower_case)
     model = AlbertForQuestionAnswering.from_pretrained(model_path,
                                                        from_tf=False,
                                                        config=config)
     return model, tokenizer
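A usage sketch for the loader above; `reader` stands in for an instance of the surrounding class, 'albert-qa-checkpoint' is a hypothetical fine-tuned checkpoint directory, and the output attributes assume a transformers version that returns output objects:

import torch

model, tokenizer = reader.load_model("albert-qa-checkpoint")
inputs = tokenizer("Who wrote Hamlet?",
                   "Hamlet is a tragedy written by William Shakespeare.",
                   return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
start = int(outputs.start_logits.argmax())
end = int(outputs.end_logits.argmax())
print(tokenizer.decode(inputs["input_ids"][0][start:end + 1]))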
Example #7
def load_model_and_tokenizer(manifest):
    zipped_model_path = download_zipped_model(manifest, assert_hash=True)
    unzipped_model_dir = get_unzipped_dir_path(zipped_model_path)
    config = AlbertConfig.from_pretrained(unzipped_model_dir)
    model = TFAlbertForSequenceClassification.from_pretrained(
        unzipped_model_dir, config=config)
    tokenizer = AlbertTokenizer.from_pretrained(unzipped_model_dir)
    return model, tokenizer
Example #8
 def __init__(self, transformer_model, is_train):
     super(LMNER, self).__init__()
     config = AlbertConfig.from_pretrained(transformer_model)
     self.transformer_model = AlbertForMaskedLM.from_pretrained(
         transformer_model, config=config)
     # whether to train the bert weights
     for name, param in self.transformer_model.named_parameters():
         param.requires_grad = is_train
Example #9
    def __init__(self, config):
        super(Model, self).__init__()
        self.config = AlbertConfig.from_pretrained(config.albert_config_path)
        self.albert = AlbertModel.from_pretrained(config.albert_model_path,
                                                  config=self.config)
        for param in self.albert.parameters():
            param.requires_grad = True

        self.fc = nn.Linear(config.hidden_size, config.num_classes)
Example #10
def load_pretrained_encoder(mpath,
                            config="albert_config.json",
                            model="albert_model.bin"):

    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model),
                                          config=b_config)

    return encoder
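`BC` and `opt` are aliases defined elsewhere in the source module; a plausible reading (an assumption, not confirmed by the snippet) plus a call sketch:

from os import path as opt                   # assumed alias for os.path
from transformers import AlbertConfig as BC  # assumed alias, given the ALBERT config file
from transformers import AlbertModel

# Hypothetical directory containing albert_config.json and albert_model.bin.
encoder = load_pretrained_encoder("albert_pretrained_dir")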
Example #11
    def __init__(self, args, train_dataloader, test_dataloader=None):
        self.args = args

        cuda_condition = torch.cuda.is_available() and args.with_cuda

        self.device = torch.device("cuda" if cuda_condition else "cpu")
        print('Current cuda device ', torch.cuda.current_device())  # check

        if args.weight_load:
            config = AutoConfig.from_pretrained(args.pre_trained_model_path)
            model_state_dict = torch.load(
                os.path.join(args.pre_trained_model_path, 'pytorch_model.bin'))
            self.model = CXRBERT.from_pretrained(args.pre_trained_model_path,
                                                 state_dict=model_state_dict,
                                                 config=config,
                                                 args=args).to(self.device)
            print('restarting training from a mid-training checkpoint')
            print(config)
        else:
            if args.bert_model == "albert-base-v2":
                config = AlbertConfig.from_pretrained(args.bert_model)
            elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
                config = AutoConfig.from_pretrained(args.bert_model)
            elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
                config = AutoConfig.from_pretrained(args.bert_model)
            elif args.bert_model == "bert-small-scratch":
                config = BertConfig.from_pretrained(
                    "google/bert_uncased_L-4_H-512_A-8")
            elif args.bert_model == "bert-base-scratch":
                config = BertConfig.from_pretrained("bert-base-uncased")
            else:
                config = BertConfig.from_pretrained(
                    args.bert_model)  # bert-base, small, tiny

            self.model = CXRBERT(config, args).to(self.device)

        wandb.watch(self.model)

        if args.with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model,
                                         device_ids=args.cuda_devices)

        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optimizer = AdamW(self.model.parameters(), lr=args.lr)

        self.mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)
        self.itm_criterion = nn.CrossEntropyLoss()

        self.log_freq = args.log_freq
        self.step_cnt = 0

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))
Example #12
def load_pretrained(mpath,
                    config="albert_config.json",
                    model="albert_model.bin"):

    b_config = BC.from_pretrained(opt.join(mpath, config))
    encoder = AlbertModel.from_pretrained(opt.join(mpath, model),
                                          config=b_config)
    tokenizer = BertTokenizer.from_pretrained(mpath)

    return encoder, tokenizer
Example #13
def load_albert(path):
    """
    Load the model.
    """
    vocab_file = os.path.join(path, 'vocab.txt')
    tokenizer = BertTokenizer.from_pretrained(vocab_file)
    # print(tokenizer)
    config = AlbertConfig.from_pretrained(path)
    model = AlbertModel.from_pretrained(path, config=config)
    return model, tokenizer
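A call sketch for the helper above; 'albert_zh_dir' is a hypothetical local directory containing vocab.txt, config.json and the model weights, and the output attribute assumes a recent transformers version:

import torch

model, tokenizer = load_albert("albert_zh_dir")
inputs = tokenizer("an example sentence", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # [1, seq_len, hidden_size]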
Example #14
 def init_model(self, model_name):
     if model_name == 'Bert':
         config = BertConfig.from_pretrained('bert-base-uncased')
         config.hidden_dropout_prob = 0.2
         config.attention_probs_dropout_prob = 0.2
         self.model = BertForMultipleChoice.from_pretrained(
             'pre_weights/bert-base-uncased_model.bin',
             config=config)
     elif model_name == 'Roberta':
         config = RobertaConfig.from_pretrained('roberta-large')
         config.hidden_dropout_prob = 0.2
         config.attention_probs_dropout_prob = 0.2
         self.model = RobertaForMultipleChoice.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=config)
         # print('load csqa pretrain weights...')
         # self.model.load_state_dict(torch.load(
         #     'checkpoints/commonsenseQA_pretrain_temp.pth'
         # ))
     elif model_name == 'Albert':
         self.model = AlbertForMultipleChoice.from_pretrained(
             'pre_weights/albert-xxlarge_model.bin',
             config=AlbertConfig.from_pretrained('albert-xxlarge-v1'))
     elif model_name == 'RobertaLM':
         config = RobertaConfig.from_pretrained('roberta-large')
         config.hidden_dropout_prob = 0.2
         config.attention_probs_dropout_prob = 0.2
         self.model = RobertaForMultipleChoiceWithLM.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=config)
     elif model_name == 'RobertaLM2':
         self.model = RobertaForMultipleChoiceWithLM2(self.tokenizer)
     elif 'GNN' in model_name:
         self.model = SOTA_goal_model(self.args)
     elif 'LM' in model_name:
         config = RobertaConfig.from_pretrained('roberta-large')
         config.hidden_dropout_prob = 0.2
         config.attention_probs_dropout_prob = 0.2
         self.model = RobertaForMultipleChoiceWithLM.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=config)
     elif 'KBERT' in model_name:
         config = RobertaConfig.from_pretrained('roberta-large')
         config.hidden_dropout_prob = 0.2
         config.attention_probs_dropout_prob = 0.2
         self.model = RobertaForMultipleChoice.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=config)
     else:
         pass
     self.model.to(self.args['device'])
     if torch.cuda.device_count() > 1 and self.args['use_multi_gpu']:
         print("{} GPUs are available. Let's use them.".format(
             torch.cuda.device_count()))
         self.model = torch.nn.DataParallel(self.model)
Example #15
    def __init__(self, coordinator_args: CoordinatorArguments,
                 collab_optimizer_args: CollaborativeOptimizerArguments,
                 averager_args: AveragerArguments, dht: hivemind.DHT):
        self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
        self.repo_path = coordinator_args.repo_path
        self.upload_interval = coordinator_args.upload_interval
        self.previous_step = -1

        config = AlbertConfig.from_pretrained(
            coordinator_args.model_config_path)
        self.model = AlbertForPreTraining(config)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        opt = Lamb(
            optimizer_grouped_parameters,
            lr=0.00176,
            weight_decay=0.01,
            clamp_value=10000.0,
            debias=True,
        )

        adjusted_target_batch_size = collab_optimizer_args.target_batch_size - collab_optimizer_args.batch_size_lead

        self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
            opt=opt,
            dht=dht,
            prefix=experiment_prefix,
            compression_type=hivemind.utils.CompressionType.Value(
                collab_optimizer_args.compression),
            throughput=collab_optimizer_args.bandwidth,
            target_batch_size=adjusted_target_batch_size,
            client_mode=collab_optimizer_args.client_mode,
            verbose=True,
            start=True,
            **asdict(averager_args))
        self.previous_timestamp = time.time()
Example #16
def init_model(cachedir='~/hashtag/', no_cuda=True):
    global tokenizer, model

    f_cachedir = os.path.expanduser(cachedir)
    bert_config = AlbertConfig.from_pretrained(f_cachedir)
    model = HashtagClassifier.from_pretrained(f_cachedir, config=bert_config)
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    model.to(device)
    model.eval()

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
Example #17
def download_albert_base():
    file = '../input/albert-base-v2'

    config = AlbertConfig.from_pretrained('albert-base-v2')
    config.save_pretrained(file)
    
    model = AlbertModel.from_pretrained('albert-base-v2')
    model.save_pretrained(file)

    tkn = AlbertTokenizer.from_pretrained('albert-base-v2')
    tkn.save_pretrained(file)
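Once download_albert_base() has run, the saved copies can be reloaded offline from the same directory (a minimal sketch):

from transformers import AlbertConfig, AlbertModel, AlbertTokenizer

local_dir = '../input/albert-base-v2'  # the directory populated above
config = AlbertConfig.from_pretrained(local_dir)
model = AlbertModel.from_pretrained(local_dir, config=config)
tokenizer = AlbertTokenizer.from_pretrained(local_dir)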
Example #18
    def build_pretrain_feature_model(self):
        mn = self.args.pretrain_feature_model_name
        if 'albert' in mn:
            pretrain_feature_tokenizer = BertTokenizer.from_pretrained(mn)
            config = AlbertConfig.from_pretrained(mn)
            config.output_hidden_states = True
            self.pretrain_feature_model = AlbertModel.from_pretrained(
                mn, config=config).to(self.device)
        else:
            pretrain_feature_tokenizer = AutoTokenizer.from_pretrained(mn)
            config = AutoConfig.from_pretrained(mn)
            config.output_hidden_states = True
            self.pretrain_feature_model = AutoModel.from_pretrained(
                mn, config=config).to(self.device)
        self.pretrain_feature_model.requires_grad_(False)
        # self.pretrain_feature_model.requires_grad_(True)
        # the pipeline expects raw text, but we already have token ids, so use the model directly
        # self.pretrain_feature_pipeline = Pipeline('feature-extraction',
        #        model=self.pretrain_feature_model, tokenizer=pretrain_feature_tokenizer)

        # TODO: precompute the features and save them to file; this uses less memory during training and is faster
        # XXX: only this tokenizer's vocab is used, not its byte-pair splitting; for now tokens are simply split on whitespace
        utils.add_special_tokens_(self.pretrain_feature_model,
                                  pretrain_feature_tokenizer)
        # FIXME: these changed args should be saved to the checkpoint file
        if self.args.pretrain_feature_type == 'mem_n2n':
            self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
            self.args.d_model = self.pretrain_feature_model.config.hidden_size
        elif self.args.pretrain_feature_type == 'feature':
            self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
        else:
            if self.pretrain_feature_model.base_model_prefix != 'bert':
                self.args.emb_dim = self.pretrain_feature_model.config.embedding_size
            else:
                self.args.emb_dim = self.pretrain_feature_model.config.hidden_size

        # XXX: for 'xlnet'
        # self.args.d_model = self.pretrain_feature_model.config.hidden_size

        if 'weight' in self.args.pretrain_feature_type:
            # few effects
            self.args.d_model = self.pretrain_feature_model.config.hidden_size
            self.args.n_head = self.pretrain_feature_model.config.num_attention_heads
            self.args.d_ff = self.pretrain_feature_model.config.intermediate_size
            self.args.factor_ff = False

        self.vocab = datasets.ChatVocab(pretrain_feature_tokenizer)
        self.input_dim = len(self.vocab)
        self.pad_idx = self.vocab.stoi(utils.PAD)
        self.embeddings = None
        # too slow
        # self.tokenizer = pretrain_feature_tokenizer.tokenize
        self.tokenizer = None
Example #19
    def __init__(self):
        super(AlbertTweetModel, self).__init__()
        config = AlbertConfig.from_pretrained(
            './albert.torch/albert-base-v2/config.json',
            output_hidden_states=True)
        self.bert = AlbertModel.from_pretrained(
            './albert.torch/albert-base-v2/pytorch_model.bin', config=config)

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.2)
        nn.init.normal_(self.fc.bias, 0)
Example #20
    def __init__(self):
        super(Model, self).__init__()

        # load the pretrained model
        self.config = AlbertConfig.from_pretrained(Config.config_bert_path)
        self.albert = AlbertModel.from_pretrained(Config.model_bert_path,
                                                  config=self.config)

        for param in self.albert.parameters():
            param.requires_grad = True

        self.qa_outputs = nn.Linear(1024, 2)
        self.loss_fct = CrossEntropyLoss()  # loss computation
Example #21
def model_setting(model_name):
    if model_name == 'bert':
        from transformers import AutoTokenizer, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        return config, tokenizer, model

    elif model_name == 'albert':
        from transformers import AutoTokenizer, AlbertForSequenceClassification, AlbertConfig
        config = AlbertConfig.from_pretrained("albert-base-v2", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        model = AlbertForSequenceClassification.from_pretrained(
            "albert-base-v2")
        return config, tokenizer, model
Example #22
def retrieve_conf(trained_condition, trained_vocab):
    albertconf = AlbertConfig.from_pretrained(
        f'albert-{trained_condition.albert_scale}-v2')
    if 'smaller' in trained_condition.keys():
        if trained_condition.smaller:  # originally 4H was used for the FFN, but 1H is used due to memory constraints
            albertconf.hidden_size = trained_condition.hidden_size
            albertconf.num_hidden_layers = trained_condition.num_hidden_layers
            albertconf.num_attention_heads = trained_condition.num_attention_heads
            albertconf.intermediate_size = albertconf.hidden_size

    albertconf.vocab_size = len(trained_vocab.itos)
    albertconf.bos_token_id = trained_vocab.stoi['BOS']
    albertconf.eos_token_id = trained_vocab.stoi['EOS']
    albertconf.pad_token_id = trained_vocab.stoi['PAD']
    albertconf.max_position_embeddings = 40

    return albertconf
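A usage sketch: one way to use the returned config is to initialise a model from scratch; trained_condition and trained_vocab stand in for the objects assumed by the function above:

from transformers import AlbertModel

albertconf = retrieve_conf(trained_condition, trained_vocab)
model = AlbertModel(albertconf)  # fresh, randomly initialised weights; not pretrained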
Example #23
def load_model_and_tokenizer():
    unzipped_saved_model_dir = get_unzipped_dir_path(MODEL_ZIP_PATH,
                                                     UNZIPPED_MODEL_PATH)

    print("Loading pretrained ALBERT classification model")
    start = time.time()

    config = AlbertConfig.from_pretrained(unzipped_saved_model_dir,
                                          num_labels=NUM_LABELS,
                                          max_length=DEFAULT_MAX_LEN)
    model = TFAlbertForSequenceClassification.from_pretrained(
        unzipped_saved_model_dir, config=config)
    tokenizer = AlbertTokenizer.from_pretrained(unzipped_saved_model_dir,
                                                do_lower_case=True)

    duration = time.time() - start
    print(f"Initializing model took {duration}")

    return model, tokenizer
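A sketch of running inference with the loaded pair; it assumes a transformers version whose TF models return output objects with a .logits attribute:

model, tokenizer = load_model_and_tokenizer()
enc = tokenizer("an example input", return_tensors="tf",
                padding="max_length", truncation=True,
                max_length=DEFAULT_MAX_LEN)
logits = model(enc).logits  # shape: (1, NUM_LABELS)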
Example #24
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
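A call sketch, assuming BERT_CONFIG_FILE is the module-level mapping from model type to config file that the function above relies on:

config = get_bert_config('albert-base-v1', output_hidden_states=True)
print(type(config).__name__, config.output_hidden_states)  # AlbertConfig True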
Example #25
    def load_model(self, config=None):
        """载入模型,返回载入后的模型组件

        Returns:
            [dict] -- [模型组件]
        """
        print("** loading model.. **")
        tokenizer = BertTokenizer.from_pretrained('../albert-small/',
                                                  cache_dir=None,
                                                  do_lower_case=True)
        bert_config = AlbertConfig.from_pretrained('../albert-small/')
        model = PairModel(config=bert_config)
        device = torch.device('cpu')
        state = torch.load(Path('../albert-small/pytorch_model.pt'),
                           map_location=device)
        model.load_state_dict(state['model'])
        model.to(device)
        model.eval()
        self.model = model
        self.tokenizer = tokenizer
Example #26
    def __init__(self, *args, **kwargs):
        super(AlbertForComparison, self).__init__(*args, **kwargs)
        self.model_name = 'albert-base-v2'
        self.config = AlbertConfig.from_pretrained(self.model_name)
        self.config.output_hidden_states = False

        self.embedding_layer = TFAlbertModel.from_pretrained(
            self.model_name, config=self.config)
        self.pre_classifier = tf.keras.layers.Dense(
            self.config.hidden_size,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.config.initializer_range),
            activation="relu",
            name="pre_classifier",
        )
        self.classifier = tf.keras.layers.Dense(
            1,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.config.initializer_range),
            name="classifier")
Example #27
 def get_model_and_tokenizer(cls, model_name):
     model = tokenizer = None
     if model_name == 'Bert':
         model = BertForMultipleChoice.from_pretrained(
             'pre_weights/bert-base-uncased_model.bin',
             config=BertConfig.from_pretrained('bert-base-uncased'))
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     elif model_name == 'Roberta':
         model = RobertaForMultipleChoice.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=RobertaConfig.from_pretrained('roberta-large'))
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
     elif model_name == 'Albert':
         model = AlbertForMultipleChoice.from_pretrained(
             'pre_weights/albert-xxlarge_model.bin',
             config=AlbertConfig.from_pretrained('albert-xxlarge-v1'))
         tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v1')
     elif model_name == 'RobertaLM':
         model = RobertaForMultipleChoiceWithLM.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=RobertaConfig.from_pretrained('roberta-large'))
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
     elif model_name == 'RobertaLM2':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
         model = RobertaForMultipleChoiceWithLM2(tokenizer)
     elif 'GNN' in model_name:
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
         model = SOTA_goal_model(cls.args)
     elif 'LM' in model_name:
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
         model = RobertaForMultipleChoiceWithLM.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=RobertaConfig.from_pretrained('roberta-large'))
     elif 'KBERT' in model_name:
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
         model = RobertaForMultipleChoice.from_pretrained(
             'pre_weights/roberta-large_model.bin',
             config=RobertaConfig.from_pretrained('roberta-large'))
     else:
         pass
     return model, tokenizer
Example #28
    def build_pretrain_feature_model(self):
        mn = self.model_config.pretrain_feature_model_name
        if 'albert' in mn:
            pretrain_feature_tokenizer = BertTokenizer.from_pretrained(mn)
            config = AlbertConfig.from_pretrained(mn)
            config.output_hidden_states = True
            self.pretrain_feature_model = AlbertModel.from_pretrained(
                mn, config=config).to(self.device)
        else:
            pretrain_feature_tokenizer = AutoTokenizer.from_pretrained(mn)
            config = AutoConfig.from_pretrained(mn)
            config.output_hidden_states = True
            self.pretrain_feature_model = AutoModel.from_pretrained(
                mn, config=config).to(self.device)
        self.pretrain_feature_model.requires_grad_(False)
        # the pipeline expects raw text, but we already have token ids, so use the model directly
        # self.pretrain_feature_pipeline = Pipeline('feature-extraction',
        #        model=self.pretrain_feature_model, tokenizer=pretrain_feature_tokenizer)

        # TODO: precompute the features and save them to file; this uses less memory during training and is faster
        # XXX: only this tokenizer's vocab is used, not its byte-pair splitting; for now tokens are simply split on whitespace
        utils.add_special_tokens_(self.pretrain_feature_model,
                                  pretrain_feature_tokenizer)
        # FIXME: these changed args should be saved to the checkpoint file
        # when using features:
        # self.args.emb_dim = self.pretrain_feature_model.config.hidden_size
        # self.model_config.emb_dim = self.pretrain_feature_model.config.hidden_size
        # when using embeddings:
        self.args.emb_dim = self.pretrain_feature_model.config.embedding_size
        self.model_config.emb_dim = self.pretrain_feature_model.config.embedding_size

        self.vocab = datasets.ChatVocab(pretrain_feature_tokenizer)
        self.input_dim = len(self.vocab)
        self.pad_idx = self.vocab.stoi(utils.PAD)

        # the pretrain_feature_model embeddings and weights are no longer needed; use the trained model instead
        self.pretrain_feature_model = None
        self.tokenizer = pretrain_feature_tokenizer.tokenize
Example #29
def train(rank, args):

    #######################
    ## distributed

    if args.distributed_enabled:
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            world_size=args.distributed_world_size,
            rank=rank)
    if args.gpu_enabled:
        device = torch.device('cuda:{}'.format(rank))
    else:
        device = torch.device('cpu')

    is_master = not args.distributed_enabled or rank == 0

    #######################
    ## preamble

    set_gpus(rank)
    set_seed(rank)
    set_cuda(deterministic=args.gpu_deterministic)

    output_dir = f'{args.output_dir}/{rank}'
    os.makedirs(output_dir, exist_ok=False)

    setup_logging(filename=f'{output_dir}/output.log', console=is_master)

    #######################
    ## dataset

    tokenizer = new_tokenizer(vocab_file=args.data_vocab_file)
    vocab_size = len(tokenizer.vocab)
    ds_train = wrap_example_builder(
        dataset=load_owt(owt_dir=args.data_dir,
                         n_tensors_per_file=args.data_n_tensors_per_file),
        vocab=tokenizer.vocab,
        max_length=args.data_max_seq_length)

    pad_token_id = tokenizer.vocab['[PAD]']
    mask_token_id = tokenizer.vocab['[MASK]']
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']

    def collate_batch(examples):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example['input_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        input_mask = torch.nn.utils.rnn.pad_sequence(
            [example['input_mask'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        segment_ids = torch.nn.utils.rnn.pad_sequence(
            [example['segment_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        return input_ids, input_mask, segment_ids

    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    ds_train_loader = iter(
        cycle(
            DataLoader(ds_train,
                       batch_size=args.opt_batch_size,
                       collate_fn=collate_batch)))

    #######################
    ## model

    def to_distributed_model(model):
        return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[rank], find_unused_parameters=True)

    def tie_weights(generator, discriminator):
        generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
        generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
        generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

    class LogitsAdapter(torch.nn.Module):
        def __init__(self, adaptee):
            super().__init__()
            self.adaptee = adaptee

        def forward(self, *args, **kwargs):
            return self.adaptee(*args, **kwargs)[0]

    from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining

    generator = ElectraForMaskedLM(
        AutoConfig.from_pretrained(args.model_generator))
    discriminator = AdaptedDiscriminator(
        AlbertConfig.from_pretrained(args.model_discriminator))

    tie_weights(generator, discriminator)

    model = to_distributed_model(
        Electra(LogitsAdapter(generator),
                LogitsAdapter(discriminator),
                num_tokens=vocab_size,
                mask_token_id=mask_token_id,
                pad_token_id=pad_token_id,
                mask_prob=args.model_mask_prob,
                mask_ignore_token_ids=[
                    tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']
                ],
                random_token_prob=0.0).to(device))

    #######################
    ## optimizer

    def get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps,
                                        num_training_steps,
                                        last_epoch=-1):
        def lr_lambda(current_step):
            learning_rate = max(
                0.0, 1. - (float(current_step) / float(num_training_steps)))
            learning_rate *= min(1.0,
                                 float(current_step) / float(num_warmup_steps))
            return learning_rate

        return LambdaLR(optimizer, lr_lambda, last_epoch)

    def get_params_without_weight_decay_ln(named_params, weight_decay):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [
                    p for n, p in named_params
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': weight_decay,
            },
            {
                'params': [
                    p for n, p in named_params
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0,
            },
        ]
        return optimizer_grouped_parameters

    optimizer = torch.optim.AdamW(get_params_without_weight_decay_ln(
        model.named_parameters(), weight_decay=0.1),
                                  lr=args.opt_lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-08)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.opt_warmup_steps,
        num_training_steps=args.opt_num_training_steps)
    scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision)

    #######################
    ## train

    t, steps_s, eta_m = time(), 0., 0

    for step in range(args.opt_num_training_steps + 1):
        input_ids, input_mask, segment_ids = next(ds_train_loader)

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        assert input_ids.shape[1] <= args.data_max_seq_length

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision):
            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        metrics = {
            'step': (step, '{:8d}'),
            'loss': (loss.item(), '{:8.5f}'),
            'loss_mlm': (loss_mlm.item(), '{:8.5f}'),
            'loss_disc': (loss_disc.item(), '{:8.5f}'),
            'acc_gen': (acc_gen.item(), '{:5.3f}'),
            'acc_disc': (acc_disc.item(), '{:5.3f}'),
            'lr': (scheduler.get_last_lr()[0], '{:8.7f}'),
            'steps': (steps_s, '{:4.1f}/s'),
            'eta': (eta_m, '{:4d}m'),
        }

        if step % args.step_log == 0:
            sep = ' ' * 2
            logger.info(
                sep.join([
                    f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items()
                ]))

        if step > 0 and step % 100 == 0:
            t2 = time()
            steps_s = 100. / (t2 - t)
            eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60)
            t = t2

        if step % 200 == 0:
            logger.info(
                np.array2string(disc_labels[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))
            logger.info(
                np.array2string(disc_pred[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))

        if step > 0 and step % args.step_ckpt == 0 and is_master:
            discriminator.electra.save_pretrained(
                f'{args.output_dir}/ckpt/{step}')
Example #30
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")

    args = parser.parse_args()

    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.device = device

    seed = 5003
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # prepare input
    import pickle
    with open('../1_1/distribution_dict1.pickle', 'rb') as f:
        distribution_dict1 = pickle.load(f)
    with open('../1_1/distribution_dict2.pickle', 'rb') as f:
        distribution_dict2 = pickle.load(f)
    with open('../1_1/distribution_dict3.pickle', 'rb') as f:
        distribution_dict3 = pickle.load(f)
    with open('../1_1/distribution_dict4.pickle', 'rb') as f:
        distribution_dict4 = pickle.load(f)

    json_dir = '../../input/simplified-nq-train.jsonl'
    max_data = 999999999

    id_list = []
    neg_id_list = []
    data_dict = {}
    neg_data_dict = {}
    with open(json_dir) as f:
        for n, line in tqdm(enumerate(f)):
            if n > max_data:
                break
            data = json.loads(line)

            is_pos = False
            annotations = data['annotations'][0]
            if annotations['yes_no_answer'] == 'YES':
                is_pos = True
            elif annotations['yes_no_answer'] == 'NO':
                is_pos = True
            elif annotations['short_answers']:
                is_pos = True
            elif annotations['long_answer']['candidate_index'] != -1:
                is_pos = True

            if is_pos and len(data['long_answer_candidates']) > 1:
                data_id = data['example_id']
                id_list.append(data_id)

                # random sampling
                if data_id in distribution_dict1:
                    candidate_index_list = np.array(
                        distribution_dict1[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict1[data_id]['prob_list']), 1)
                elif data_id in distribution_dict2:
                    candidate_index_list = np.array(
                        distribution_dict2[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict2[data_id]['prob_list']), 1)
                elif data_id in distribution_dict3:
                    candidate_index_list = np.array(
                        distribution_dict3[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict3[data_id]['prob_list']), 1)
                else:
                    candidate_index_list = np.array(
                        distribution_dict4[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict4[data_id]['prob_list']), 1)
                prob_list /= sum(prob_list)
                negative_candidate_index = random_sample_negative_candidates(
                    candidate_index_list, prob_list)

                #
                doc_words = data['document_text'].split()
                # negative
                candidate = data['long_answer_candidates'][
                    negative_candidate_index]
                negative_candidate_words = doc_words[
                    candidate['start_token']:candidate['end_token']]
                negative_candidate_start = candidate['start_token']
                negative_candidate_end = candidate['end_token']
                # positive
                candidate = data['long_answer_candidates'][
                    annotations['long_answer']['candidate_index']]
                positive_candidate_words = doc_words[
                    candidate['start_token']:candidate['end_token']]
                positive_candidate_start = candidate['start_token']
                positive_candidate_end = candidate['end_token']

                # initialize data_dict
                data_dict[data_id] = {
                    'question_text': data['question_text'],
                    'annotations': data['annotations'],
                    'positive_text': positive_candidate_words,
                    'positive_start': positive_candidate_start,
                    'positive_end': positive_candidate_end,
                    'negative_text': negative_candidate_words,
                    'negative_start': negative_candidate_start,
                    'negative_end': negative_candidate_end,
                }

            elif (not is_pos) and len(data['long_answer_candidates']) >= 1:
                data_id = data['example_id']
                neg_id_list.append(data_id)

                # random sampling
                if data_id in distribution_dict1:
                    candidate_index_list = np.array(
                        distribution_dict1[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict1[data_id]['prob_list']), 1)
                elif data_id in distribution_dict2:
                    candidate_index_list = np.array(
                        distribution_dict2[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict2[data_id]['prob_list']), 1)
                elif data_id in distribution_dict3:
                    candidate_index_list = np.array(
                        distribution_dict3[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict3[data_id]['prob_list']), 1)
                else:
                    candidate_index_list = np.array(
                        distribution_dict4[data_id]['candidate_index_list'])
                    prob_list = np.power(
                        np.array(distribution_dict4[data_id]['prob_list']), 1)
                prob_list /= sum(prob_list)
                negative_candidate_index = random_sample_negative_candidates(
                    candidate_index_list, prob_list)

                #
                doc_words = data['document_text'].split()
                # negative
                candidate = data['long_answer_candidates'][
                    negative_candidate_index]
                negative_candidate_words = doc_words[
                    candidate['start_token']:candidate['end_token']]
                negative_candidate_start = candidate['start_token']
                negative_candidate_end = candidate['end_token']

                # initialize data_dict
                neg_data_dict[data_id] = {
                    'question_text': data['question_text'],
                    'negative_text': negative_candidate_words,
                    'negative_start': negative_candidate_start,
                    'negative_end': negative_candidate_end,
                }

    print(len(id_list), len(neg_id_list))
    random.shuffle(id_list)
    random.shuffle(neg_id_list)  # neg_id_list must be longer than id_list, otherwise the data generator will raise an error

    # hyperparameters
    max_seq_len = 360
    max_question_len = 64
    learning_rate = 0.000004
    batch_size = 3
    ep = 0

    # build model
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model_path = 'model/'
    config = AlbertConfig.from_pretrained(model_path)
    config.num_labels = 5
    config.vocab_size = 30010
    tokenizer = AlbertTokenizer.from_pretrained(model_path, do_lower_case=True)
    #print(tokenizer.unk_token_id)
    model = AlbertForQuestionAnswering.from_pretrained('weights/epoch1/',
                                                       config=config)

    # add new tokens
    new_token_dict = {
        '<P>': 'qw1',
        '<Table>': 'qw2',
        '<Tr>': 'qw3',
        '<Ul>': 'qw4',
        '<Ol>': 'qw5',
        '<Fl>': 'qw6',
        '<Li>': 'qw7',
        '<Dd>': 'qw8',
        '<Dt>': 'qw9',
    }
    new_token_list = [
        'qw1',
        'qw2',
        'qw3',
        'qw4',
        'qw5',
        'qw6',
        'qw7',
        'qw8',
        'qw9',
        'qw99',
    ]

    num_added_toks = tokenizer.add_tokens(new_token_list)
    print('We have added', num_added_toks, 'tokens')
    model.resize_token_embeddings(len(tokenizer))

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O1",
                                      verbosity=0)
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=True)

    # training

    # iterator for training
    train_datagen = TFQADataset(id_list=id_list, neg_id_list=neg_id_list)
    train_sampler = DistributedSampler(train_datagen)
    train_collate = Collator(id_list=id_list,
                             neg_id_list=neg_id_list,
                             data_dict=data_dict,
                             neg_data_dict=neg_data_dict,
                             new_token_dict=new_token_dict,
                             tokenizer=tokenizer,
                             max_seq_len=max_seq_len,
                             max_question_len=max_question_len)
    train_generator = DataLoader(dataset=train_datagen,
                                 sampler=train_sampler,
                                 collate_fn=train_collate,
                                 batch_size=batch_size,
                                 num_workers=3,
                                 pin_memory=True)

    # train
    losses1 = AverageMeter()  # start
    losses2 = AverageMeter()  # end
    losses3 = AverageMeter()  # class
    accuracies1 = AverageMeter()  # start
    accuracies2 = AverageMeter()  # end
    accuracies3 = AverageMeter()  # class
    model.train()
    for j, (batch_input_ids, batch_attention_mask, batch_token_type_ids,
            batch_y_start, batch_y_end, batch_y) in enumerate(train_generator):
        batch_input_ids = batch_input_ids.cuda()
        batch_attention_mask = batch_attention_mask.cuda()
        batch_token_type_ids = batch_token_type_ids.cuda()
        labels1 = batch_y_start.cuda()
        labels2 = batch_y_end.cuda()
        labels3 = batch_y.cuda()

        logits1, logits2, logits3 = model(batch_input_ids,
                                          batch_attention_mask,
                                          batch_token_type_ids)
        y_true = (batch_y_start, batch_y_end, batch_y)
        loss1, loss2, loss3 = loss_fn((logits1, logits2, logits3),
                                      (labels1, labels2, labels3))
        loss = loss1 + loss2 + loss3
        acc1, n_position1 = get_position_accuracy(logits1, labels1)
        acc2, n_position2 = get_position_accuracy(logits2, labels2)
        acc3, n_position3 = get_position_accuracy(logits3, labels3)

        losses1.update(loss1.item(), n_position1)
        losses2.update(loss2.item(), n_position2)
        losses3.update(loss3.item(), n_position3)
        accuracies1.update(acc1, n_position1)
        accuracies2.update(acc2, n_position2)
        accuracies3.update(acc3, n_position2)

        optimizer.zero_grad()

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

    if args.local_rank == 0:
        print(
            'epoch: {}, train_loss1: {}, train_loss2: {}, train_loss3: {}, train_acc1: {}, train_acc2: {}, train_acc3: {}'
            .format(ep, losses1.avg, losses2.avg, losses3.avg, accuracies1.avg,
                    accuracies2.avg, accuracies3.avg),
            flush=True)

        out_dir = 'weights/epoch2/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torch.save(model.module.state_dict(), out_dir + 'pytorch_model.bin')