Python DistilBertForMultipleChoice Examples

Programming Language: Python

Namespace/Package Name: transformers

Examples at hotexamples.com: 4

Python DistilBertForMultipleChoice - 4 examples found. These are the top rated real world Python examples of transformers.DistilBertForMultipleChoice extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

DistilBertForMultipleChoice(2)

eval(2)

from_pretrained(2)

to(2)

Example #1

Show file

File: test_modeling_distilbert.py Project: lego0901/pytea

 def create_and_check_distilbert_for_multiple_choice(
     self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     """Check the logits shape produced by DistilBertForMultipleChoice.

     A model is built from ``config`` (after ``num_choices`` is set on it),
     moved to ``torch_device`` and put in eval mode. The 2-D ``input_ids`` and
     ``input_mask`` tensors are repeated along a new second axis of size
     ``self.num_choices`` before being fed to the model together with
     ``choice_labels``. ``sequence_labels`` and ``token_labels`` are accepted
     but not used here.
     """
     config.num_choices = self.num_choices

     model = DistilBertForMultipleChoice(config=config)
     model.to(torch_device)
     model.eval()

     def _repeat_per_choice(tensor):
         # Insert a choice axis at position 1 and materialise the expanded view.
         return tensor.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

     choice_input_ids = _repeat_per_choice(input_ids)
     choice_attention_mask = _repeat_per_choice(input_mask)

     result = model(choice_input_ids, attention_mask=choice_attention_mask, labels=choice_labels)

     expected_shape = (self.batch_size, self.num_choices)
     self.parent.assertEqual(result.logits.shape, expected_shape)

Example #2

Show file

 def create_and_check_distilbert_for_multiple_choice(
         self, config, input_ids, input_mask, sequence_labels,
         token_labels, choice_labels):
     """Run DistilBertForMultipleChoice once and check its logits and loss.

     ``input_ids`` and ``input_mask`` (2-D) are repeated along a new second
     axis of size ``self.num_choices``. The model call is unpacked as a
     ``(loss, logits)`` pair, the logits size is compared against
     ``[batch_size, num_choices]`` and the pair is handed to
     ``self.check_loss_output``. ``sequence_labels`` and ``token_labels`` are
     accepted but not used here.
     """
     num_choices = self.num_choices
     config.num_choices = num_choices

     model = DistilBertForMultipleChoice(config=config)
     model.to(torch_device)
     model.eval()

     # Add a choice axis at position 1 and materialise the expanded views.
     expanded_ids = input_ids.unsqueeze(1)
     expanded_ids = expanded_ids.expand(-1, num_choices, -1).contiguous()
     expanded_mask = input_mask.unsqueeze(1)
     expanded_mask = expanded_mask.expand(-1, num_choices, -1).contiguous()

     loss, logits = model(expanded_ids, attention_mask=expanded_mask, labels=choice_labels)
     result = dict(loss=loss, logits=logits)

     expected_size = [self.batch_size, self.num_choices]
     self.parent.assertListEqual(list(result["logits"].size()), expected_size)
     self.check_loss_output(result)

Example #3

Show file

def _append_results(output_file, result, header=None):
    """Log each entry of ``result`` and append it to ``output_file``.

    Entries are written in sorted key order as ``key = value`` lines. When
    ``header`` is given it is logged (not written) before the entries.
    """
    with open(output_file, "a+") as writer:
        if header is not None:
            logger.info(header)
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))


def _evaluate(model, tokenizer, args, device, data_dirs, title):
    """Evaluate ``model`` on the RACE examples found under ``data_dirs``.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode) and runs every batch under ``torch.no_grad()``.

    Returns:
        A tuple ``(loss_sum, accuracy_sum, nb_steps, nb_examples)`` where
        ``loss_sum`` is the sum of the per-batch mean losses and
        ``accuracy_sum`` is the sum of the per-batch ``accuracy(...)`` values
        (presumably a count of correct predictions; verify against
        ``accuracy``).
    """
    eval_examples = read_race_examples(data_dirs)
    eval_features = convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, True)
    logger.info("***** Running evaluation: %s *****", title)
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    loss_sum, accuracy_sum = 0, 0
    nb_steps, nb_examples = 0, 0
    for batch in eval_dataloader:
        input_ids, input_mask, segment_ids, label_ids = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            if args.bert_type == 0:
                # Assumes the BERT model returns a bare loss when labels are
                # passed and bare logits otherwise, which is what the training
                # loop relies on (it calls .mean()/.item() on the return
                # value). TODO confirm against the imported
                # BertForMultipleChoice.
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)
            else:
                output = model(input_ids=input_ids,
                               attention_mask=input_mask,
                               labels=label_ids)
                tmp_eval_loss = output['loss']
                logits = output['logits']

        # `accuracy` receives the raw per-choice logits in every evaluation
        # pass; do not argmax beforehand.
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        loss_sum += tmp_eval_loss.mean().item()
        accuracy_sum += accuracy(logits, label_ids)
        nb_examples += input_ids.size(0)
        nb_steps += 1

    return loss_sum, accuracy_sum, nb_steps, nb_examples


def main():
    """Fine-tune and/or evaluate a BERT or DistilBERT multiple-choice model on RACE.

    Command-line driven. With ``--do_train`` the model is trained on
    ``<data_dir>/train/{high,middle}``, optionally evaluated on the dev split
    at the end of an epoch, and saved to ``<output_dir>/pytorch_model.bin``.
    The saved weights are then reloaded and, with ``--do_eval`` (non-distributed
    runs only), evaluated on ``<data_dir>/test/high`` and ``.../middle``;
    results are appended to ``<output_dir>/eval_results.txt``.

    Raises:
        ValueError: if neither ``--do_train`` nor ``--do_eval`` is given, if
            ``--bert_type`` is not 0 or 1, or if training is requested and the
            output directory already exists and is not empty.
    """
    # Example settings previously noted here:
    #   num_train_epochs = 8, train_batch_size = 8, max_seq_length = 512,
    #   learning_rate = 1e-5, warmup_proportion = 0.1,
    #   gradient_accumulation_steps = 4, data_dir = './Dataset/RACE/RACE/'

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--bert_type",
                        default=None,
                        type=int,
                        required=True,
                        help="0:bert, 1: distilbert")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")

    args = parser.parse_args()

    # Fail early with a clear message; otherwise tokenizer/model stay None and
    # the script dies later with an unrelated AttributeError.
    if args.bert_type not in (0, 1):
        raise ValueError("`bert_type` must be 0 (bert) or 1 (distilbert), got {}.".format(args.bert_type))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
                 device, n_gpu, bool(args.local_rank != -1)))

    # From here on train_batch_size is the per-forward-pass batch size.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    if args.bert_type == 0:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = DistilBertTokenizerFast.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None

    if args.do_train:
        train_dir = os.path.join(args.data_dir, 'train')
        train_examples = read_race_examples([train_dir + '/high', train_dir + '/middle'])
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if args.bert_type == 0:
        model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                      cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
                                                          args.local_rank),
                                                      num_choices=4)
    else:
        model = DistilBertForMultipleChoice.from_pretrained(args.bert_model)
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.bert_type == 0:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        for ep in range(int(args.num_train_epochs)):
            # Re-enter train mode every epoch: the dev evaluation at the end
            # of an epoch switches the model to eval mode, and without this
            # all later epochs would train with dropout disabled.
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            count = 0
            logger.info("Trianing Epoch: {}/{}".format(ep + 1, int(args.num_train_epochs)))
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                if args.bert_type == 0:
                    loss = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 labels=label_ids)
                else:
                    # DistilBERT takes no token_type_ids.
                    result = model(input_ids=input_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids)
                    loss = result['loss']
                    logits = result['logits']
                    # Running training accuracy, printed for every batch.
                    compare = np.array(label_ids.cpu()) == np.array(logits.argmax(axis=1).cpu())
                    count += np.sum(compare)
                    print("\nLabel: {}, Prediction: {}, Accuracy: {}"
                          .format(label_ids, logits.argmax(axis=1), count / (args.train_batch_size * (step + 1))))
                if n_gpu > 1:
                    # DataParallel returns one loss per replica.
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                if global_step%100 == 0:
                    logger.info("Training loss: {}, global step: {}".format(tr_loss/nb_tr_steps, global_step))

            ## evaluate on dev set
            # NOTE(review): this check runs once per epoch, so the dev pass only
            # happens when an epoch happens to end on a multiple of 1000 steps.
            if global_step % 1000 == 0:
                dev_dir = os.path.join(args.data_dir, 'dev')
                dev_set = [dev_dir+'/high', dev_dir+'/middle']

                dev_loss, dev_accuracy, dev_steps, dev_examples = _evaluate(
                    model, tokenizer, args, device, dev_set, "Dev")

                result = {'dev_eval_loss': dev_loss / dev_steps,
                          'dev_eval_accuracy': dev_accuracy / dev_examples,
                          'global_step': global_step,
                          'loss': tr_loss/nb_tr_steps}
                _append_results(output_eval_file, result, header="***** Dev results *****")

        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        torch.save(model_to_save.state_dict(), output_model_file)

    # The checkpoint holds the unwrapped model's keys, so load into the
    # unwrapped model too; loading into a DataParallel wrapper would fail on
    # the missing "module." prefix. map_location lets a GPU-trained checkpoint
    # be evaluated on whatever device this run uses.
    model_to_load = model.module if hasattr(model, 'module') else model
    model_to_load.load_state_dict(torch.load(output_model_file, map_location=device))
    if args.do_eval and args.local_rank == -1:
        test_dir = os.path.join(args.data_dir, 'test')
        test_high = [test_dir + '/high']
        test_middle = [test_dir + '/middle']

        ## test high
        high_eval_loss, high_eval_accuracy, high_nb_eval_steps, high_nb_eval_examples = _evaluate(
            model, tokenizer, args, device, test_high, "test high")
        result = {'high_eval_loss': high_eval_loss / high_nb_eval_steps,
                  'high_eval_accuracy': high_eval_accuracy / high_nb_eval_examples}
        _append_results(output_eval_file, result, header="***** Eval results *****")

        ## test middle
        middle_eval_loss, middle_eval_accuracy, middle_nb_eval_steps, middle_nb_eval_examples = _evaluate(
            model, tokenizer, args, device, test_middle, "test middle")
        result = {'middle_eval_loss': middle_eval_loss / middle_nb_eval_steps,
                  'middle_eval_accuracy': middle_eval_accuracy / middle_nb_eval_examples}
        _append_results(output_eval_file, result)

        # all test
        eval_loss = (middle_eval_loss + high_eval_loss) / (middle_nb_eval_steps + high_nb_eval_steps)
        eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / (middle_nb_eval_examples + high_nb_eval_examples)

        result = {'overall_eval_loss': eval_loss,
                  'overall_eval_accuracy': eval_accuracy}
        _append_results(output_eval_file, result)

Example #4

Show file

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    train_examples = read_mctest_examples([train_dir + '.tsv', train_dir + '.ans'])
    num_train_steps = int(
        len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_features = convert_examples_to_features(
        train_examples, tokenizer, max_seq_length, True)

    train_data = MCTestDataset(train_features)

    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

    model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')

    model.to(device)
    model = torch.nn.DataParallel(model)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    global_step = 0
    count = 0

    model.train()
    for epoch in range(num_train_epochs):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        print("Training Epoch: {}/{}".format(epoch + 1, int(num_train_epochs)))
        for step, batch in enumerate(train_loader):