Example #1
    def __init__(self, config):
        super(Model, self).__init__()
        model_config = XLNetConfig.from_pretrained(config.bert_path, num_labels=config.num_classes)
        self.xlnet = XLNetForSequenceClassification.from_pretrained(config.bert_path, config=model_config)
        for param in self.xlnet.parameters():  # fine-tune all XLNet parameters
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)
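A forward pass is implied but not shown in this fragment. A minimal sketch of what it might look like, assuming the standard transformers API (the `forward` below is an assumption, not part of the original):

    def forward(self, input_ids, attention_mask=None):
        # XLNetForSequenceClassification already returns classification logits
        # as the first element of its output tuple, so self.fc is only needed
        # if the wrapper bypasses the model's built-in classification head.
        outputs = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
        return outputs[0]  # (batch_size, num_classes)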
Example #2
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.pretrained_model_path,
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token)

        self.sos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.sos_token_idx = self.tokenizer.bos_token_id
        self.eos_token_idx = self.tokenizer.eos_token_id
        self.padding_token_idx = self.tokenizer.pad_token_id

        self.configuration = XLNetConfig.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            pad_token_id=self.padding_token_idx)

        self.decoder = XLNetLMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #3
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.eval_generate_num = config['eval_generate_num']

        self.tokenizer = XLNetTokenizer.from_pretrained(
            'xlnet-base-cased',
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token,
            unk_token=dataset.eos_token)

        self.configuration = XLNetConfig.from_pretrained('xlnet-base-cased')

        self.decoder = XLNetLMHeadModel.from_pretrained(
            'xlnet-base-cased', config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.mask_token = '<mask>'
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_seq_length = config['max_seq_length']
        self.device = config["device"]

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
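Both generation examples above build the loss with reduction='none', which yields one loss value per token rather than a scalar. A sketch of how such a token-level loss is typically reduced with a padding mask (assumed usage, not shown in the original):

    import torch

    pad_idx, vocab_size = 5, 32000
    logits = torch.randn(2, 7, vocab_size)            # (batch, seq_len, vocab)
    targets = torch.randint(0, vocab_size, (2, 7))    # (batch, seq_len)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='none')
    token_loss = loss_fn(logits.reshape(-1, vocab_size), targets.reshape(-1))
    mask = (targets.reshape(-1) != pad_idx).float()
    loss = (token_loss * mask).sum() / mask.sum()     # mean over non-padding tokens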
Example #4
    def __init__(self, model_name, model_type):
        """
        Hyper-parameters found with validation set:
        xlnet-large-casd : epoch = 4,  learning_rate = 1E-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epoch = 4,  learning_rate = 3E-5, batch_size = 16, epsilon = 1e-8
        ALBERT xxlarge-v2 large : epoch = 3,  learning_rate = 5E-5, batch_size = 8, epsilon = 1e-6 to be improved...
        """
        self.model_name = model_name
        self.model_type = model_type

        # Per the transformers library, a batch size of 16 or 32 is advised for training; to limit memory use, we take 16.
        # Gradient accumulation has not led to great improvement and therefore won't be used here.
        if model_type == 'albert':
            self.batch_size = 8
        else:
            self.batch_size = 16

        available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
        available_model_type = ["bert", "xlnet", "albert"]

        if self.model_name not in available_model_name:
            raise Exception("Error: model_name should be in", available_model_name)
        if self.model_type not in available_model_type:
            raise Exception("Error: model_type should be in", available_model_type)

        # Load the pretrained model with a single linear regression layer on top of the pooled output.
        # To load a fine-tuned model instead, e.g.: BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
        if self.model_type == 'bert':
            self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for regression task
            self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'xlnet':
            self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        elif self.model_type == 'albert':
            self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
            self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        self.model.cuda()

        if self.model_name == 'xlnet-large-cased':
            self.epochs = 4
            self.lr = 1e-5
            self.eps = 1e-6

        elif self.model_name == 'bert-large-uncased':
            self.epochs = 4
            self.lr = 3e-5
            self.eps = 1e-8

        elif self.model_name == 'albert-xxlarge-v2':
            self.epochs = 3
            self.lr = 5e-5
            self.eps = 1e-6

        self.max_grad_norm = 1.0  # Gradient clipping threshold: gradient norms exceeding this value are scaled down to it.

        self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)
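self.max_grad_norm is stored here but applied elsewhere. In a typical training step it would be used roughly as follows (a sketch of the usual pattern, not code from this example):

    # loss.backward()
    # torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
    # self.optimizer.step()
    # self.optimizer.zero_grad()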
Example #5
    def __init__(
        self,
        language=Language.ENGLISHCASED,
        num_labels=5,
        cache_dir=".",
        num_gpus=None,
        num_epochs=1,
        batch_size=8,
        lr=5e-5,
        adam_eps=1e-8,
        warmup_steps=0,
        weight_decay=0.0,
        max_grad_norm=1.0,
    ):
        """Initializes the classifier and the underlying pretrained model.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to 'xlnet-base-cased'.
            num_labels (int, optional): The number of unique labels in the
                training data. Defaults to 5.
            cache_dir (str, optional): Location of XLNet's cache directory.
                Defaults to ".".
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 8.
            lr (float): Learning rate of the Adam optimizer. Defaults to 5e-5.
            adam_eps (float, optional): Term added to the denominator to improve
                                        numerical stability. Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps over which the learning
                                        rate increases linearly from 0 to its set value. Defaults to 0.
            weight_decay (float, optional): Weight decay. Defaults to 0.
            max_grad_norm (float, optional): Maximum norm for the gradients. Defaults to 1.0
        """

        if num_labels < 2:
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
        self.cache_dir = cache_dir

        self.num_gpus = num_gpus
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.adam_eps = adam_eps
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.max_grad_norm = max_grad_norm

        # create classifier
        self.config = XLNetConfig.from_pretrained(
            self.language.value, num_labels=num_labels, cache_dir=cache_dir
        )
        self.model = XLNetForSequenceClassification(self.config)
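Note that XLNetForSequenceClassification(self.config) builds the architecture from the config alone, so its weights start randomly initialized. Unless the surrounding class loads a checkpoint later, the usual pattern for starting from pretrained weights would be (a sketch under that assumption):

    # self.model = XLNetForSequenceClassification.from_pretrained(
    #     self.language.value, config=self.config, cache_dir=cache_dir
    # )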
Example #6
def get_bert_model(model_name=config.MODEL_NAME):
    # a. Load the tokenizer from the vocabulary
    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    # b. Load the configuration file
    model_config = XLNetConfig.from_pretrained(model_name)
    # Modify the configuration here if needed

    return tokenizer, model_config
Example #7
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path,
         config_path, pretrain_model_path, output_record_path,
         model_save_path):
    seed_everything(997)
    num_train_epochs = train_epoch
    pretrain_batch_size = batch_size

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    #     train_dataset = LineByLineTextDataset(block_size=128, file_path=corpus_path, tokenizer=tokenizer)

    #     data = read_data(corpus_path, tokenizer)
    train_dataset = OppoDataset(train_file_path=corpus_path,
                                tokenizer=tokenizer,
                                maxlen=128)

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer)

    config = XLNetConfig.from_pretrained(
        pretrained_model_name_or_path=config_path)
    #     model = XLNetForMaskedLM(config=config,name='./xlnet_model/pytorch_model.bin')
    if os.path.exists(pretrain_model_path):
        model = XLNetLMHeadModel.from_pretrained(pretrain_model_path,
                                                 config=config)
    else:
        model = XLNetLMHeadModel(config=config)


#     data_collator = Collator(max_seq_len=seq_length, tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=pretrain_batch_size,
        save_strategy='no',
        seed=997)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)

    trainer.train()
    trainer.save_model(model_save_path)
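A caveat on the collator used above: DataCollatorForPermutationLanguageModeling rejects odd sequence lengths (the dataset's maxlen of 128 satisfies this) and exposes two masking parameters. A sketch with the library defaults spelled out:

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=1 / 6,   # ratio of tokens to mask
        max_span_length=5)       # maximum length of a masked token span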
Example #8
def main():
    # Get command-line arguments (auto-generated from the docstring at the top of this file)
    args = docopt(__doc__)
    pprint(args)

    # Get the parameters
    lr = float(args['--lr'])
    seq_len = int(args['--seq_len'])
    max_epoch = int(args['--max_epoch'])
    batch_size = int(args['--batch_size'])
    num_train = int(args['--num_train'])
    num_valid = int(args['--num_valid'])

    # Select the model
    pretrained_weights = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(pretrained_weights)
    config = XLNetConfig.from_pretrained(pretrained_weights, num_labels=4)
    model = XLNetForSequenceClassification.from_pretrained(pretrained_weights,
                                                           config=config)
    print(model.config.num_labels)

    # Get the device to use
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load the data and build the datasets
    encoder = TwinPhraseEncoder(tokenizer, seq_len)

    train_dataset = WordnetDataset(mode='train',
                                   num_data=num_train,
                                   transform=encoder)
    valid_dataset = WordnetDataset(mode='valid',
                                   num_data=num_valid,
                                   transform=encoder)
    train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=True)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(1, max_epoch + 1):
        print('=' * 27 + f' Epoch {epoch:0>2} ' + '=' * 27)
        # Training
        loss, accu = train_model(model, optimizer, train_loader, device)
        print(
            f'|  Training    |  loss-avg : {loss:>8.6f}  |  accuracy : {accu:>8.3%}  |'
        )
        # Validation
        loss, accu = valid_model(model, optimizer, valid_loader, device)
        print(
            f'|  Validation  |  loss-avg : {loss:>8.6f}  |  accuracy : {accu:>8.3%}  |'
        )
        # Save the model
        torch.save(model.state_dict(), f'../result/{pretrained_weights}.pkl')
Example #9
    def init_model(self):
        basic_encoder = None
        if self.config['use_bert']:
            bert_config = BertConfig.from_pretrained(self.config['bert_model_name'], cache_dir=self.config['bert_dir'])
            if self.config['num_bert_layer'] is not None:
                bert_config.num_hidden_layers = self.config['num_bert_layer']
            bert = BertModel.from_pretrained(self.config['bert_model_name'], cache_dir=self.config['bert_dir'], config=bert_config)
            basic_encoder = bert
        elif self.config['use_xlnet']:
            xlnet_config = XLNetConfig.from_pretrained('hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir'])
            xlnet_config.n_layer = self.config['num_xlnet_layer']
            xlnet_config.mem_len = self.config['xlnet_mem_len']
            xlnet = XLNetModel.from_pretrained('hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir'], config=xlnet_config)
            basic_encoder = xlnet
        elif self.config['use_transformer']:
            bert_config = BertConfig.from_pretrained('bert-base-chinese', cache_dir=self.config['bert_dir'])
            if self.config['num_transformer_layer'] is not None:
                bert_config.num_hidden_layers = self.config['num_transformer_layer']
            transf = BertModel(bert_config)
            basic_encoder = transf
        elif self.config['use_rnn_basic_encoder']:
            pass
        else:
            raise Exception('Unsupported basic encoder')

        self.model = DocEE(self.config, basic_encoder, self.tokenizer)
        if self.config['cuda']:
            self.model.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['learning_rate'])

        if self.config['resume_model']:
            OUTPUT_DIR = self.config['output_dir']
            MODEL_SAVE_DIR = os.path.join(OUTPUT_DIR, self.config['model_save_dir'])
            if os.path.exists(MODEL_SAVE_DIR):
                cpt_file_names = os.listdir(MODEL_SAVE_DIR)
                if len(cpt_file_names) > 0:
                    epoch_record = []
                    for cpt_file_name in cpt_file_names:
                        epoch_record.append(int(cpt_file_name.split('-')[-1].split('.')[0]))
                    epoch_record.sort()
                    latest_epoch = epoch_record[-1]
                    self.latest_epoch = latest_epoch + 1

                    latest_model_file_name = os.path.join(MODEL_SAVE_DIR, self.config['model_file'] % (self.config['ee_method'], latest_epoch))
                    if self.config['cuda']:
                        store_dict = torch.load(latest_model_file_name, map_location=torch.device('cuda'))
                    else:
                        store_dict = torch.load(latest_model_file_name, map_location='cpu')
                    self.model.load_state_dict(store_dict['model_state'])
                    self.optimizer.load_state_dict(store_dict['optimizer_state'])
                    print('resume training from %s' % latest_model_file_name)
        print('model init finished')
Example #10
    def __init__(self, config, x_embed):
        super().__init__()

        # pretrained_weights = "xlnet-base-cased"
        self.output_attentions = config.output_attentions
        self.model = XLNetModel.from_pretrained(
            config.pretrained_weights,
            output_attentions=self.output_attentions)
        self.pretrained_config = XLNetConfig.from_pretrained(
            config.pretrained_weights)
        self.encoder_out_size = self.model.config.d_model

        return
Example #11
def make_model(args, device):
    if args.model == "roberta":
        config = RobertaConfig.from_pretrained("roberta-base")
        config.num_labels = 5
        if args.dataset == "imdb":
            config.num_labels = 2
        if args.dataset == "ag_news":
            config.num_labels = 4
        if args.dataset == "yahoo":
            config.num_labels = 10
        pretrained_model = RobertaForSequenceClassification.from_pretrained(
            "roberta-base", config=config)
        return scl_model_Roberta(config,
                                 device,
                                 pretrained_model,
                                 with_semi=args.with_mix,
                                 with_sum=args.with_summary)

    if args.model == "bert":
        config = BertConfig.from_pretrained("bert-base-uncased")
        config.num_labels = 5
        if args.dataset == "imdb":
            config.num_labels = 2
        if args.dataset == "ag_news":
            config.num_labels = 4
        if args.dataset == "yahoo":
            config.num_labels = 10
        pretrained_model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", config=config)
        return scl_model_Bert(config,
                              device,
                              pretrained_model,
                              with_semi=args.with_mix,
                              with_sum=args.with_summary)

    if args.model == "xlnet":
        config = XLNetConfig.from_pretrained("xlnet-base-cased")
        config.num_labels = 5
        if args.dataset == "imdb":
            config.num_labels = 2
        if args.dataset == "ag_news":
            config.num_labels = 4
        if args.dataset == "yahoo":
            config.num_labels = 10
        pretrained_model = XLNetForSequenceClassification.from_pretrained(
            "xlnet-base-cased", config=config)
        return scl_model_Xlnet(config,
                               device,
                               pretrained_model,
                               with_semi=args.with_mix,
                               with_sum=args.with_summary)
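The three branches repeat the same dataset-to-label-count mapping. A small lookup table would keep it in one place (a refactor sketch, not the original code):

    NUM_LABELS = {"imdb": 2, "ag_news": 4, "yahoo": 10}

    def label_count(dataset_name, default=5):
        # 5 matches the fallback each branch above starts from
        return NUM_LABELS.get(dataset_name, default)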
Example #12
def get_xlnet():
    ids = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='ids')
    att = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='att')
    tok_type_ids = keras.layers.Input(shape=(None, ),
                                      dtype=tf.int32,
                                      name='tti')

    config = XLNetConfig.from_pretrained(Config.XLNet.config)
    xlnet_model = TFXLNetModel.from_pretrained(Config.XLNet.model,
                                               config=config)

    x = xlnet_model(ids, attention_mask=att, token_type_ids=tok_type_ids)

    x1 = keras.layers.Dropout(0.15)(x[0])
    x1 = keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(64, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(32, 2, padding='same')(x1)
    x1 = keras.layers.Conv1D(1, 1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1)

    x2 = keras.layers.Dropout(0.15)(x[0])
    x2 = keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(64, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(32, 2, padding='same')(x2)
    x2 = keras.layers.Conv1D(1, 1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids],
                               outputs=[x1, x2])

    optimizer = keras.optimizers.Adam(learning_rate=6e-5)
    if Config.Train.use_amp:
        optimizer = keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, 'dynamic')
    loss = keras.losses.CategoricalCrossentropy(
        label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)

    return model
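The two softmax heads ('sts' and 'ets') each produce a per-token distribution, which suggests start/end span prediction. A decoding sketch under that assumption (the batch names are placeholders):

    # import numpy as np
    # start_probs, end_probs = model.predict([ids_batch, att_batch, tti_batch])
    # start_idx = int(np.argmax(start_probs[0]))
    # end_idx = int(np.argmax(end_probs[0]))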
Example #13
def get_model(model_dir, saved_dir, n_labels, task, model_type='xlnet-base-cased'):
    config = XLNetConfig.from_pretrained(
            model_type,
            num_labels=n_labels,
            finetuning_task=task,
            )
    model = XLNetModel(config=config)
    model.load_state_dict(torch.load(saved_dir+'/pytorch_model.bin'))
    model.eval()  # set dropout and normalisation layers to evaluation mode
    tokeniser = XLNetTokenizer.from_pretrained(
            model_dir,
            config=config,
            from_tf=True,
            )
    return model, tokeniser
Example #14
def demo5():
    from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
    import torch
    # Define the model path and initialize the tokenizer
    XLN_PATH = r"D:\transformr_files\XLNetLMHeadModel"
    tokenizer = XLNetTokenizer.from_pretrained(XLN_PATH)
    # Load the configuration
    model_config = XLNetConfig.from_pretrained(XLN_PATH)
    # Set the number of classes to 3
    model_config.num_labels = 3
    # Build XLNetForSequenceClassification directly from the XLNet config (equivalent to the previous section's approach)
    cls_model = XLNetForSequenceClassification.from_pretrained(
        XLN_PATH, config=model_config)
    # Set evaluation mode
    cls_model.eval()
    token_codes = tokenizer.encode_plus("i like you, what about you")
Example #15
    def __init__(self, config, x_embed):
        super().__init__()

        # pretrained_weights = "xlnet-base-cased"
        self.model = XLNetModel.from_pretrained(config.pretrained_weights)
        self.pretrained_config = XLNetConfig.from_pretrained(config.pretrained_weights)

        # if config.use_gpu:
        #   self.model = self.model.to(device=torch.device("cuda"))
        # if config.use_parallel:
        #   self.model = torch.nn.DataParallel(self.model)

        # self.encoder_out_size = 768
        self.encoder_out_size = self.model.config.d_model

        return
Example #16
def get_bert(bert_name):
    if 'roberta' in bert_name:
        print('load roberta-base')
        model_config = RobertaConfig.from_pretrained('roberta-base')
        model_config.output_hidden_states = True
        bert = RobertaModel.from_pretrained('roberta-base', config=model_config)
    elif 'xlnet' in bert_name:
        print('load xlnet-base-cased')
        model_config = XLNetConfig.from_pretrained('xlnet-base-cased')
        model_config.output_hidden_states = True
        bert = XLNetModel.from_pretrained('xlnet-base-cased', config=model_config)
    else:
        print('load bert-base-uncased')
        model_config = BertConfig.from_pretrained('bert-base-uncased')
        model_config.output_hidden_states = True
        bert = BertModel.from_pretrained('bert-base-uncased', config=model_config)
    return bert
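With output_hidden_states=True, the encoder returns every layer's hidden states in addition to the final layer; the exact position in the output tuple varies across transformers versions. A usage sketch (an assumption, not from the original):

    # bert = get_bert('xlnet-base-cased')
    # outputs = bert(input_ids)
    # last_hidden = outputs[0]     # (batch, seq_len, hidden)
    # hidden_states = outputs[-1]  # tuple of per-layer states; position is version-dependent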
Example #17
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
Example #18
    def __init__(self, model_name='xlnet'):
        self._model_name = model_name
        self._device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # default parameter
        self._max_seq_length = 384
        self._doc_stride = 128
        self._max_query_length = 64

        MODEL_CLASSES = {
            "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)
        }
        # self._config, self._model_class, self._tokenizer = MODEL_CLASSES[model_name]
        self._tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                         do_lower_case=True)
        # do_lower_case is a tokenizer option, not a config option, so it is not passed here
        self._config = XLNetConfig.from_pretrained('xlnet-base-cased')
        # this class wraps required setting for transform learning
        self._model = XLNetForQuestionAnswering.from_pretrained(
            'xlnet-base-cased', config=self._config)
Example #19
    def __init__(self, model_name, drop_prob=dropout_prob):
        super(transformer_model, self).__init__()

        # Instantiate an XLNet config with the specified arguments; it defines the model architecture
        configuration = XLNetConfig.from_pretrained(model_name,
                                                    output_hidden_states=True)
        self.xlnet = XLNetModel.from_pretrained(model_name,
                                                config=configuration)

        # freeze the first `freeze_layers` children of the model
        if to_freeze:
            cnt = 0
            for child in self.xlnet.children():
                cnt = cnt + 1
                if cnt <= freeze_layers:
                    for param in child.parameters():
                        param.requires_grad = False

        self.fc1 = nn.Linear(xlnet_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, final_size)
        self.dropout = nn.Dropout(p=drop_prob)
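This fragment references several module-level settings that must be defined elsewhere in the file. A plausible set of definitions with placeholder values (assumptions, not the original configuration):

    # dropout_prob, to_freeze, freeze_layers = 0.3, True, 6
    # xlnet_dim = 768                         # hidden size of xlnet-base-cased
    # hidden_dim1, hidden_dim2, final_size = 256, 64, 2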
Example #20
def save_model(model_dir, saved_dir, n_labels, task, model_type='xlnet-base-cased'):
    """
    save checkpoints as pytorch model: model.ckpt.index
                                       model.ckpt.data-00000-of-00001
    """
    config = XLNetConfig.from_pretrained(
            model_type,
            num_labels=n_labels,
            finetuning_task=task,
            )
    model = XLNetModel.from_pretrained(
            model_dir,
            config=config,
            from_tf=True,
            )
    tokeniser = XLNetTokenizer.from_pretrained(
            model_dir,
            config=config,
            from_tf=True,
            )
    model.save_pretrained(saved_dir)
    # print model params
    for param in model.state_dict():
        print(param, "\t", model.state_dict()[param].size())
Example #21
# -*- coding: utf-8 -*-
"""
@Time : 2020/11/13 15:04
@Auth : xiaolu
@File :test.py
@IDE :PyCharm
@Email:[email protected]
"""
import torch
from pdb import set_trace
from transformers import XLNetConfig, XLNetModel
from transformers import XLNetTokenizer

if __name__ == '__main__':
    tokenizer = XLNetTokenizer.from_pretrained('./xlnet_pretrain/spiece.model')
    config = XLNetConfig.from_pretrained('./xlnet_pretrain/config.json')
    model = XLNetModel.from_pretrained('./xlnet_pretrain/pytorch_model.bin',
                                       config=config)

    text = '你是我患得患失的梦' * 500
    text = tokenizer.tokenize(text)
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(text)).view(1, -1)
    print(input_ids.size())
    output = model(input_ids)
    set_trace()
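Loading from individual files (spiece.model, config.json, pytorch_model.bin) works in the older transformers versions this snippet targets; newer releases generally expect a directory. The equivalent directory-based pattern would be (an assumption):

    # tokenizer = XLNetTokenizer.from_pretrained('./xlnet_pretrain')
    # config = XLNetConfig.from_pretrained('./xlnet_pretrain')
    # model = XLNetModel.from_pretrained('./xlnet_pretrain', config=config)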
Example #22
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help=
        "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. "
    )

    parser.add_argument("--set_type",
                        type=str,
                        help="Specify train/test file.")

    args = parser.parse_args()

    # Load training data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading {} dataset".format(args.set_type))
    dataloader = load_featurized_examples(batch_size=32,
                                          set_type=args.set_type,
                                          feature_save_path=feature_save_path)

    # Load saved model
    config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=2292)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           config=config)
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    summaries = torch.empty(0, config.d_model).to(device)
    labels = torch.empty(0, config.num_labels).to(device)
    for i, batch in enumerate(dataloader):
        model.eval()
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids = batch

            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            label_ids = label_ids.to(device).float()
            transformer_outputs = model.module.transformer(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                input_mask=input_mask)

            output = transformer_outputs[0]
            # take the first token's representation as the document summary
            # (note: XLNet places its <cls> token at the end of the sequence, not at position 0)
            summary = output[:, 0]
            summary = summary.to(device)

            summaries = torch.cat([summaries, summary], dim=0)
            labels = torch.cat([labels, label_ids])

            if i % 1000 == 0 and i > 0:
                logger.info("Embedded and summarized batch {} of {}".format(
                    i, len(dataloader)))

        # Save the embedded representations of the documents every 12,000 batches to save memory
        if i % 12000 == 0 and i > 0:
            logger.info("Saving summaries...")
            torch.save(
                summaries,
                os.path.join(
                    feature_save_path,
                    args.set_type + '_summaries_{}.pt'.format(int(i / 12000))))
            torch.save(
                labels,
                os.path.join(
                    feature_save_path,
                    args.set_type + '_label_ids_{}.pt'.format(int(i / 12000))))
            summaries = torch.empty(0, config.d_model).to(device)
            labels = torch.empty(0, config.num_labels).to(device)

    # Save any remaining embedded representations
    if i % 12000 != 0:
        logger.info("Saving summaries...")
        torch.save(
            summaries,
            os.path.join(
                feature_save_path, args.set_type +
                '_summaries_{}.pt'.format(int(math.ceil(i / 12000)))))
        torch.save(
            labels,
            os.path.join(
                feature_save_path, args.set_type +
                '_label_ids_{}.pt'.format(int(math.ceil(i / 12000)))))

    return
Example #23
    def __init__(self):
        super(XlnetModelTest, self).__init__()
        config = XLNetConfig.from_pretrained('Saier/models/config.json')
        self.xlnet = XLNetForSequenceClassification(config)  # built from the config alone: weights start randomly initialized
        self.device = torch.device("cuda")
Example #24
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = XLNetTokenizerFast.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = XLNetForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, all others get 1.0.
            # The cls token gets 1.0 too (for predictions of empty answers).
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if training_args.do_train:
        train_dataset = datasets["train"].map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, all others get 1.0.
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set the offset_mapping entries that are not part of the context to None, so it is easy to determine
            # whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer)

    # Post-processing:
    def post_processing_function(examples, features, predictions):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    # TODO: Once the fix lands in a Datasets release, remove the _local here and the squad_v2_local folder.
    current_dir = os.path.sep.join(os.path.join(__file__).split(os.path.sep)[:-1])
    metric = load_metric(os.path.join(current_dir, "squad_v2_local") if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Example #25
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = XLNetConfig.from_pretrained(args.model_name_or_path)
    tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path)
    model = XLNetForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config)

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    column_names = raw_datasets["train"].column_names

    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )

    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, the others get 1.0.
            # The cls token gets 0.0 too (so that empty answers can be predicted).
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples
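
To make the stride/overflow behavior above concrete, here is a minimal standalone sketch (model name and texts are illustrative assumptions, not from the script) showing one long example turning into several overlapping features:

from transformers import XLNetTokenizerFast

tok = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
enc = tok(
    "What color is the sky?",
    "The sky is blue. " * 200,  # artificially long context to force overflow
    truncation="only_second",
    max_length=64,
    stride=16,
    return_overflowing_tokens=True,
)
print(len(enc["input_ids"]))              # > 1: several features for one example
print(enc["overflow_to_sample_mapping"])  # every feature maps back to sample 0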

    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if args.max_train_samples is not None:
        # We will select samples from the whole dataset if the argument is specified
        train_dataset = train_dataset.select(range(args.max_train_samples))
    # Create train features from the dataset
    with accelerator.main_process_first():
        train_dataset = train_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )
    if args.max_train_samples is not None:
        # The number of samples might increase during feature creation, so we select only the specified max samples
        train_dataset = train_dataset.select(range(args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take up a lot of space). So we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, the others 1.0.
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set the offset_mapping entries that are not part of the context to None, so it's easy to
            # determine whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples
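
A tiny hand-worked illustration of the p_mask rule used in both preprocessing functions (all values below are made up): context tokens and the CLS position get 0.0, everything else 1.0.

special = [1, 0, 0, 1, 0, 0, 1]  # special_tokens_mask for one feature
seq_ids = [3, 0, 0, 3, 1, 1, 3]  # token_type_ids after specials are set to 3
cls_index, context_idx = 6, 1
p_mask = [0.0 if (not special[k] and s == context_idx) or k == cls_index else 1.0
          for k, s in enumerate(seq_ids)]
print(p_mask)  # [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0]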

    if "validation" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_examples = raw_datasets["validation"]
    if args.max_eval_samples is not None:
        # We will select samples from the whole dataset
        eval_examples = eval_examples.select(range(args.max_eval_samples))
    # Validation Feature Creation
    with accelerator.main_process_first():
        eval_dataset = eval_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )

    if args.max_eval_samples is not None:
        # Feature creation might increase the number of samples, so we select the required number again
        eval_dataset = eval_dataset.select(range(args.max_eval_samples))

    if args.do_predict:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = raw_datasets["test"]
        if args.max_predict_samples is not None:
            # We will select samples from the whole dataset
            predict_examples = predict_examples.select(
                range(args.max_predict_samples))
        # Predict Feature Creation
        with accelerator.main_process_first():
            predict_dataset = predict_examples.map(
                prepare_validation_features,
                batched=True,
                num_proc=args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )
            if args.max_predict_samples is not None:
                # Feature creation might increase the number of samples, so we select the required number again
                predict_dataset = predict_dataset.select(
                    range(args.max_predict_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to a
        # multiple of 8, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
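
As a quick illustration of dynamic padding (a sketch with made-up inputs; the model name is an assumption), the collator pads each batch to its longest member rounded up to a multiple of 8, not to max_seq_length:

from transformers import DataCollatorWithPadding, XLNetTokenizerFast

tok = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
collator = DataCollatorWithPadding(tok, pad_to_multiple_of=8)
batch = collator([tok("short"), tok("a slightly longer sentence")])
print(batch["input_ids"].shape)  # second dimension is a multiple of 8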

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)

    eval_dataset_for_model = eval_dataset.remove_columns(
        ["example_id", "offset_mapping"])
    eval_dataloader = DataLoader(eval_dataset_for_model,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    if args.do_predict:
        predict_dataset_for_model = predict_dataset.remove_columns(
            ["example_id", "offset_mapping"])
        predict_dataloader = DataLoader(
            predict_dataset_for_model,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size)

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=args.version_2_with_negative,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if args.version_2_with_negative:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v,
                "no_answer_probability": scores_diff_json[k],
            } for k, v in predictions.items()]
        else:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v
            } for k, v in predictions.items()]

        references = [{
            "id": ex["id"],
            "answers": ex[answer_column_name]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = load_metric(
        "squad_v2" if args.version_2_with_negative else "squad")

    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
        """
        Create and fill a numpy array of size len_of_validation_data * max_length_of_output_tensor.

        Args:
            start_or_end_logits(:obj:`tensor`):
                The output predictions of the model. We can only enter either start or end logits.
            dataset: Evaluation dataset
            max_len(:obj:`int`):
                The maximum length of the output tensor (see the model.eval() part for more details).
        """

        step = 0
        # create a numpy array and fill it with -100.
        logits_concat = np.full((len(dataset), max_len),
                                -100,
                                dtype=np.float32)
        # Now that the array is created, we populate it with the outputs gathered using accelerator.gather
        for i, output_logit in enumerate(
                start_or_end_logits):  # populate columns
            # Copy each whole tensor into the newly created array,
            # advancing `step` by the batch size after every iteration

            batch_size = output_logit.shape[0]
            cols = output_logit.shape[1]
            if step + batch_size < len(dataset):
                logits_concat[step:step + batch_size, :cols] = output_logit
            else:
                logits_concat[step:, :cols] = output_logit[:len(dataset) -
                                                           step]

            step += batch_size

        return logits_concat
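
The filling logic above can be checked in isolation with synthetic shapes (this sketch is not part of the original script): per-step logits of varying width land in one -100-padded array.

import numpy as np

chunks = [np.ones((2, 3), dtype=np.float32), 2 * np.ones((1, 5), dtype=np.float32)]
out = np.full((3, 5), -100, dtype=np.float32)
step = 0
for c in chunks:
    out[step:step + c.shape[0], :c.shape[1]] = c
    step += c.shape[0]
print(out)  # rows 0-1 filled to width 3, row 2 to width 5, -100 elsewhere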

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
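
A quick illustrative check of the grouping rule (the parameter names below are hypothetical): anything whose name contains "bias" or "LayerNorm.weight" is exempt from weight decay.

names = ["layer.0.ff.weight", "layer.0.ff.bias", "layer.0.LayerNorm.weight"]
no_decay = ["bias", "LayerNorm.weight"]
print([n for n in names if not any(nd in n for nd in no_decay)])  # ['layer.0.ff.weight']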

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in a multi-process setup)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
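
A worked example of the step math above (assumed numbers): with 1000 batches per epoch and gradient_accumulation_steps=4 there are ceil(1000/4)=250 updates per epoch, so 3 epochs give max_train_steps=750.

import math

print(3 * math.ceil(1000 / 4))  # 750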

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir,
                                            save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}",
                    blocking=False,
                    auto_lfs_prune=True)
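
The update condition in the loop above fires every gradient_accumulation_steps batches and also on the final batch, so no gradients are dropped at the end of an epoch. A toy trace with synthetic sizes:

accum, num_batches = 2, 5
for step in range(num_batches):
    if step % accum == 0 or step == num_batches - 1:
        print("optimizer step at batch", step)  # 0, 2, 4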

    # initialize all lists to collect the batches
    all_start_top_log_probs = []
    all_start_top_index = []
    all_end_top_log_probs = []
    all_end_top_index = []
    all_cls_logits = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            start_top_log_probs = outputs.start_top_log_probs
            start_top_index = outputs.start_top_index
            end_top_log_probs = outputs.end_top_log_probs
            end_top_index = outputs.end_top_index
            cls_logits = outputs.cls_logits

            if not args.pad_to_max_length:  # pad predictions and labels so they can be gathered
                start_top_log_probs = accelerator.pad_across_processes(
                    start_top_log_probs, dim=1, pad_index=-100)
                start_top_index = accelerator.pad_across_processes(
                    start_top_index, dim=1, pad_index=-100)
                end_top_log_probs = accelerator.pad_across_processes(
                    end_top_log_probs, dim=1, pad_index=-100)
                end_top_index = accelerator.pad_across_processes(
                    end_top_index, dim=1, pad_index=-100)
                cls_logits = accelerator.pad_across_processes(cls_logits,
                                                              dim=1,
                                                              pad_index=-100)

            all_start_top_log_probs.append(
                accelerator.gather(start_top_log_probs).cpu().numpy())
            all_start_top_index.append(
                accelerator.gather(start_top_index).cpu().numpy())
            all_end_top_log_probs.append(
                accelerator.gather(end_top_log_probs).cpu().numpy())
            all_end_top_index.append(
                accelerator.gather(end_top_index).cpu().numpy())
            all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())

    max_len = max(x.shape[1] for x in all_end_top_log_probs)  # get the max_length of the tensors

    # concatenate all numpy arrays collected above
    start_top_log_probs_concat = create_and_fill_np_array(
        all_start_top_log_probs, eval_dataset, max_len)
    start_top_index_concat = create_and_fill_np_array(all_start_top_index,
                                                      eval_dataset, max_len)
    end_top_log_probs_concat = create_and_fill_np_array(
        all_end_top_log_probs, eval_dataset, max_len)
    end_top_index_concat = create_and_fill_np_array(all_end_top_index,
                                                    eval_dataset, max_len)
    cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

    # free the last batch's tensors now that everything has been gathered and concatenated
    del start_top_log_probs
    del start_top_index
    del end_top_log_probs
    del end_top_index
    del cls_logits

    outputs_numpy = (
        start_top_log_probs_concat,
        start_top_index_concat,
        end_top_log_probs_concat,
        end_top_index_concat,
        cls_logits_concat,
    )
    prediction = post_processing_function(eval_examples, eval_dataset,
                                          outputs_numpy)
    eval_metric = metric.compute(predictions=prediction.predictions,
                                 references=prediction.label_ids)
    logger.info(f"Evaluation metrics: {eval_metric}")

    if args.do_predict:
        # initialize all lists to collect the batches

        all_start_top_log_probs = []
        all_start_top_index = []
        all_end_top_log_probs = []
        all_end_top_index = []
        all_cls_logits = []
        for step, batch in enumerate(predict_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
                start_top_log_probs = outputs.start_top_log_probs
                start_top_index = outputs.start_top_index
                end_top_log_probs = outputs.end_top_log_probs
                end_top_index = outputs.end_top_index
                cls_logits = outputs.cls_logits

                if not args.pad_to_max_length:  # pad predictions and labels so they can be gathered
                    start_top_log_probs = accelerator.pad_across_processes(
                        start_top_log_probs, dim=1, pad_index=-100)
                    start_top_index = accelerator.pad_across_processes(
                        start_top_index, dim=1, pad_index=-100)
                    end_top_log_probs = accelerator.pad_across_processes(
                        end_top_log_probs, dim=1, pad_index=-100)
                    end_top_index = accelerator.pad_across_processes(
                        end_top_index, dim=1, pad_index=-100)
                    cls_logits = accelerator.pad_across_processes(
                        cls_logits, dim=1, pad_index=-100)

                all_start_top_log_probs.append(
                    accelerator.gather(start_top_log_probs).cpu().numpy())
                all_start_top_index.append(
                    accelerator.gather(start_top_index).cpu().numpy())
                all_end_top_log_probs.append(
                    accelerator.gather(end_top_log_probs).cpu().numpy())
                all_end_top_index.append(
                    accelerator.gather(end_top_index).cpu().numpy())
                all_cls_logits.append(
                    accelerator.gather(cls_logits).cpu().numpy())

        max_len = max(x.shape[1] for x in all_end_top_log_probs)  # get the max_length of the tensors

        # concatenate all numpy arrays collected above
        start_top_log_probs_concat = create_and_fill_np_array(
            all_start_top_log_probs, predict_dataset, max_len)
        start_top_index_concat = create_and_fill_np_array(
            all_start_top_index, predict_dataset, max_len)
        end_top_log_probs_concat = create_and_fill_np_array(
            all_end_top_log_probs, predict_dataset, max_len)
        end_top_index_concat = create_and_fill_np_array(
            all_end_top_index, predict_dataset, max_len)
        cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

        # free the last batch's tensors now that everything has been gathered and concatenated
        del start_top_log_probs
        del start_top_index
        del end_top_log_probs
        del end_top_index
        del cls_logits

        outputs_numpy = (
            start_top_log_probs_concat,
            start_top_index_concat,
            end_top_log_probs_concat,
            end_top_index_concat,
            cls_logits_concat,
        )

        prediction = post_processing_function(predict_examples,
                                              predict_dataset, outputs_numpy)
        predict_metric = metric.compute(predictions=prediction.predictions,
                                        references=prediction.label_ids)
        logger.info(f"Predict metrics: {predict_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training",
                                 auto_lfs_prune=True)
Example #26
0
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='xlnet-base-cased',
                        help='model name or path')
    args = parser.parse_args()

    config = XLNetConfig.from_pretrained(args.model)
    model = XLNetModel.from_pretrained(args.model, config=config)
    tokenizer = XLNetTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)
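
The snippet above references batcher and prepare, which this excerpt omits. A minimal sketch following SentEval's callback convention (the mean-pooling choice and the .cuda() placement are assumptions, not from the original):

import torch

def prepare(params, samples):
    return  # nothing to precompute for a frozen pretrained encoder

def batcher(params, batch):
    sents = [" ".join(tokens) if tokens else "." for tokens in batch]
    enc = params["tokenizer"](sents, return_tensors="pt", padding=True, truncation=True)
    enc = {k: v.cuda() for k, v in enc.items()}
    with torch.no_grad():
        out = params["model"](**enc)
    return out.last_hidden_state.mean(dim=1).cpu().numpy()  # one vector per sentence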
Example #27
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_qa_beam_search", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            field="data",
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = XLNetTokenizerFast.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = XLNetForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        column_names = raw_datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[
        0]
    context_column_name = "context" if "context" in column_names else column_names[
        1]
    answer_column_name = "answers" if "answers" in column_names else column_names[
        2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take up a lot of space). So we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, the others get 1.0.
            # The cls token gets 0.0 too (so that empty answers can be predicted).
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            # Select samples from the dataset; this helps decrease processing time
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        # Create Training Features
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                prepare_train_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        if data_args.max_train_samples is not None:
            # Select samples from the dataset again, since feature creation might increase the number of features
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens get 0.0, the others 1.0.
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set the offset_mapping entries that are not part of the context to None, so it's easy to
            # determine whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_examples = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # Select eval samples from the dataset
            max_eval_samples = min(len(eval_examples),
                                   data_args.max_eval_samples)
            eval_examples = eval_examples.select(range(max_eval_samples))
        # Create Features from Eval Dataset
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_examples.map(
                prepare_validation_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
        if data_args.max_eval_samples is not None:
            # Select samples from the dataset again, since feature creation might increase the number of samples
            max_eval_samples = min(len(eval_dataset),
                                   data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

    if training_args.do_predict:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            # We will select samples from the whole dataset
            predict_examples = predict_examples.select(
                range(data_args.max_predict_samples))
        # Test Feature Creation
        with training_args.main_process_first(
                desc="prediction dataset map pre-processing"):
            predict_dataset = predict_examples.map(
                prepare_validation_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )
        if data_args.max_predict_samples is not None:
            # Feature creation might increase the number of samples, so we select the required number again
            max_predict_samples = min(len(predict_dataset),
                                      data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(
                range(max_predict_samples))

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorWithPadding(
                         tokenizer,
                         pad_to_multiple_of=8 if training_args.fp16 else None))

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            log_level=log_level,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v,
                "no_answer_probability": scores_diff_json[k],
            } for k, v in predictions.items()]
        else:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v
            } for k, v in predictions.items()]

        references = [{
            "id": ex["id"],
            "answers": ex[answer_column_name]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = evaluate.load(
        "squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions,
                              references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        results = trainer.predict(predict_dataset, predict_examples)
        metrics = results.metrics

        max_predict_samples = (data_args.max_predict_samples
                               if data_args.max_predict_samples is not None
                               else len(predict_dataset))
        metrics["predict_samples"] = min(max_predict_samples,
                                         len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "question-answering"
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
Example #28
0
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id",
        type=str,
        help="Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. ")
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. ")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. ")

    parser.add_argument("--set_type",
                        type=str,
                        help="Specify train/test file.")

    args = parser.parse_args()

    # Load the featurized data for the requested set_type
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading test dataset")
    test_dataloader = load_featurized_examples(
        batch_size=32,
        set_type=args.set_type,
        feature_save_path=feature_save_path)

    # Load saved model
    model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/',
                              args.model_id,
                              'model_checkpoint_' + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    config = XLNetConfig.from_pretrained(
        os.path.join(model_path, 'config.json'),
        num_labels=2292)  # TODO: check if we need this
    model = XLNetForSequenceClassification.from_pretrained(model_path,
                                                           config=config)
    model.to(device)
    # Wrap in DataParallel (assumes at least one visible GPU; `model.module` is used below)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    summaries = torch.empty(0, config.d_model).to(device)
    all_doc_ids = torch.empty(0).to(device)
    all_label_ids = torch.empty(0, 2292).to(device)

    for i, batch in enumerate(test_dataloader):
        model.eval()
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids, doc_ids = batch

            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            doc_ids = doc_ids.to(device).float()
            label_ids = label_ids.to(device).float()
            transformer_outputs = model.module.transformer(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                input_mask=input_mask)

            output = transformer_outputs[0]
            # Extract the representation of the first (summary/CLS-style) token
            summary = output[:, 0]
            summary = summary.to(device)

            summaries = torch.cat([summaries, summary], dim=0)
            all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
            all_label_ids = torch.cat([all_label_ids, label_ids], dim=0)

    # Average the representation of the CLS token for all examples from the same document
    # Build a (num_docs x num_windows) indicator matrix; L1-normalizing each row
    # turns the matrix product below into a per-document average.
    mask = torch.zeros(int(all_doc_ids.max().item()) + 1, len(summaries))
    # Index tensors must live on the same device as `mask`, hence the .cpu()
    mask[all_doc_ids.long().cpu(), torch.arange(len(summaries))] = 1
    averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1).to(device)
    mean_summaries = torch.mm(averaging_matrix, summaries)

    # Create an object storing one copy of the labels per document
    last_doc_id = -1
    label_ids = torch.empty(0, all_label_ids.size()[1]).to(device)
    for (i, doc_id) in enumerate(all_doc_ids):
        if doc_id.item() != last_doc_id:
            label_ids = torch.cat([label_ids, all_label_ids[i].unsqueeze(0)])
            last_doc_id = doc_id.item()

    # Save the embedded representations of the document, along with the labels
    torch.save(
        mean_summaries,
        os.path.join(feature_save_path, args.set_type + '_summaries.pt'))
    torch.save(
        label_ids,
        os.path.join(feature_save_path, args.set_type + '_doc_label_ids.pt')
    )  # one record per document (label_ids.pt, by contrast, has one record per window)

    return
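The indicator-matrix averaging used above is compact but easy to misread; here is a minimal, self-contained sketch of the same trick on toy data (all names below are illustrative, not from the script):

import torch

# Three windows belonging to two documents (doc 0 has two windows, doc 1 has one).
doc_ids = torch.tensor([0, 0, 1])
summaries = torch.tensor([[1.0, 2.0],
                          [3.0, 4.0],
                          [5.0, 6.0]])

# Rows index documents, columns index windows.
mask = torch.zeros(int(doc_ids.max().item()) + 1, len(summaries))
mask[doc_ids.long(), torch.arange(len(summaries))] = 1

# L1-normalizing each row turns summation into averaging.
averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1)
mean_summaries = torch.mm(averaging_matrix, summaries)
print(mean_summaries)  # tensor([[2., 3.], [5., 6.]])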
Example #29
0
def train():
    # Load the pretrained XLNet (converted from a TensorFlow checkpoint)
    config = XLNetConfig.from_pretrained('xlnet_config.json')
    model = XLNetForQuestionAnswering.from_pretrained('xlnet_model.ckpt.index',
                                                      from_tf=True,
                                                      config=config)
    device = args.device
    model.to(device)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = adabound.AdaBound(optimizer_grouped_parameters,
                                  lr=1e-3,
                                  final_lr=0.1)
    # Prepare the data
    data = Dureader()
    train_dataloader, dev_dataloader = data.train_iter, data.dev_iter

    best_loss = 100000.0
    model.train()
    for i in range(args.num_train_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")):
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                                        batch.input_ids, batch.input_mask, batch.segment_ids, batch.start_position, batch.end_position
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                                        input_ids.to(device), input_mask.to(device), segment_ids.to(device), start_positions.to(device), end_positions.to(device)

            # Compute the loss
            outputs = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs[0]
            loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Step the optimizer once enough gradients have accumulated
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Periodically evaluate on the dev set and keep the best checkpoint
            if step % args.log_step == 4:
                eval_loss = evaluate.evaluate(model, dev_dataloader)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(),
                               './model_dir/best_model')
                # evaluate() leaves the model in eval mode, so switch back to
                # training mode whether or not a checkpoint was saved
                model.train()
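The loop above leans on an external evaluate.evaluate(model, dev_dataloader) that is not reproduced here. A minimal sketch of such a helper, assuming dev batches expose the same fields as the training batches (an assumption, not the original implementation):

import torch

def evaluate(model, dev_dataloader):
    """Hypothetical helper: mean QA loss over the dev set."""
    model.eval()
    device = next(model.parameters()).device
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in dev_dataloader:
            outputs = model(batch.input_ids.to(device),
                            token_type_ids=batch.segment_ids.to(device),
                            attention_mask=batch.input_mask.to(device),
                            start_positions=batch.start_position.to(device),
                            end_positions=batch.end_position.to(device))
            total_loss += outputs[0].item()
            num_batches += 1
    return total_loss / max(num_batches, 1)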
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)
    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Indicate batch size")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--val_logging_step",
        default=100000,
        type=int,
        help="Number of steps in between logs of performance on validation set"
    )
    parser.add_argument(
        "--train_logging_step",
        default=1000,
        type=int,
        help="Number of steps in between logs of performance on training set")
    parser.add_argument("--save_step",
                        default=100000,
                        type=int,
                        help="Number of steps to save model parameters")
    parser.add_argument(
        "--model_id",
        type=str,
        help=
        "Model and optimizer will be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}'. "
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help=
        "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. "
    )
    parser.add_argument(
        "--model_type",
        default="base",
        type=str,
        help="Whether to use the xlnet base model or the xlnet large model")
    parser.add_argument("--learning_rate",
                        default=4e-5,
                        type=float,
                        help="Learning rate for optimizer")
    args = parser.parse_args()

    # Set random seed
    set_seeds(seed=args.seed, n_gpu=n_gpu)

    # Load data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading train dataset")
    train_dataloader = load_featurized_examples(
        args.batch_size, set_type="train", feature_save_path=feature_save_path)
    logger.info("Loading validation dataset")
    val_dataloader = load_featurized_examples(
        args.batch_size, set_type="val", feature_save_path=feature_save_path)

    # Load pretrained model
    num_train_optimization_steps = args.num_train_epochs * len(
        train_dataloader)

    if args.model_type == "large":
        config = XLNetConfig.from_pretrained('xlnet-large-cased',
                                             num_labels=2292)
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-large-cased', config=config)
    else:
        config = XLNetConfig.from_pretrained(
            'xlnet-base-cased', num_labels=2292)  # TODO: check if we need this
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', config=config)

    model.to(device)

    optimizer, scheduler, model = initialize_optimizer(model, train_dataloader,
                                                       args)

    logger.info("***** Running training *****")
    logger.info("  Num batches = %d", len(train_dataloader))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size  = %d", args.batch_size)
    logger.info("  Total optimization steps = %d",
                len(train_dataloader) * args.num_train_epochs)

    # Wrap in DataParallel (assumes at least one visible GPU)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))
    train(train_dataloader=train_dataloader,
          val_dataloader=val_dataloader,
          model=model,
          optimizer=optimizer,
          scheduler=scheduler,
          num_train_epochs=args.num_train_epochs,
          n_gpu=n_gpu,
          device=device,
          model_id=args.model_id,
          save_step=args.save_step,
          train_logging_step=args.train_logging_step,
          val_logging_step=args.val_logging_step)
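Both scripts in this file call a load_featurized_examples helper that is not shown. A minimal sketch of what it might look like, assuming the preprocessing stage saved each split as one serialized dict of tensors (the file name, field names, and ordering below are assumptions, chosen to match how batches are unpacked above):

import os
import torch
from torch.utils.data import DataLoader, TensorDataset

def load_featurized_examples(batch_size, set_type, feature_save_path):
    # Hypothetical layout: one '<set_type>_features.pt' dict per split.
    features = torch.load(os.path.join(feature_save_path, set_type + '_features.pt'))
    dataset = TensorDataset(features['input_ids'],
                            features['input_mask'],
                            features['segment_ids'],
                            features['label_ids'],
                            features['doc_ids'])
    # Shuffle only the training split; evaluation order must stay stable so
    # that windows from the same document remain contiguous (the embedding
    # script's per-document label loop relies on this).
    return DataLoader(dataset, batch_size=batch_size, shuffle=(set_type == 'train'))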