Example #1
def gen(args):
    # Build SQuAD-style features from the raw JSON files and save them as compressed NumPy archives.
    tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])

    train_examples = data_utils.read_squad_examples(os.path.join(args.input_dir, "train.json"), is_training=True)
    
    train_features = data_utils.convert_examples_to_features(
        train_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True)
    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))

    # int16 keeps the archives small; BERT-base's ~30K-token vocabulary and positions up to max_seq_length fit in 16 bits.
    input_ids_np = np.array([f.input_ids for f in train_features], dtype=np.int16)
    segment_ids_np = np.array([f.segment_ids for f in train_features], dtype=np.int16)
    input_mask_np = np.array([f.input_mask for f in train_features], dtype=np.int16)
    start_positions_np = np.array([f.start_position for f in train_features], dtype=np.int16)
    end_positions_np = np.array([f.end_position for f in train_features], dtype=np.int16)

    np.savez_compressed(os.path.join(args.output_dir, "data.npz"),
                        input_ids=input_ids_np,
                        segment_ids=segment_ids_np,
                        input_mask=input_mask_np,
                        start_positions=start_positions_np,
                        end_positions=end_positions_np)
    
    # >>>>> validation
    valid_examples = data_utils.read_squad_examples(os.path.join(args.input_dir, "dev.json"), is_training=True)

    valid_features = data_utils.convert_examples_to_features(
        valid_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True)
    
    logger.info("  Num orig examples = %d", len(valid_examples))
    logger.info("  Num split examples = %d", len(valid_features))

    valid_input_ids_np = np.array([f.input_ids for f in valid_features], dtype=np.int16)
    valid_segment_ids_np = np.array([f.segment_ids for f in valid_features], dtype=np.int16)
    valid_input_mask_np = np.array([f.input_mask for f in valid_features], dtype=np.int16)
    valid_start_positions_np = np.array([f.start_position for f in valid_features], dtype=np.int16)
    valid_end_positions_np = np.array([f.end_position for f in valid_features], dtype=np.int16)
    
    np.savez_compressed(os.path.join(args.output_dir, "dev.npz"),
                        input_ids=valid_input_ids_np,
                        segment_ids=valid_segment_ids_np,
                        input_mask=valid_input_mask_np,
                        start_positions=valid_start_positions_np,
                        end_positions=valid_end_positions_np)
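A minimal sketch, not part of the original example, of reading the data.npz archive written above back into PyTorch tensors; the key names follow the savez_compressed call, while the path and the cast back to int64 are assumptions.

import numpy as np
import torch
from torch.utils.data import TensorDataset

# Hypothetical path; gen() writes the archive to os.path.join(args.output_dir, "data.npz").
data = np.load("output/data.npz")

# Cast the int16 arrays back to int64, since embedding lookups and the loss expect long tensors.
train_data = TensorDataset(
    torch.from_numpy(data["input_ids"].astype(np.int64)),
    torch.from_numpy(data["segment_ids"].astype(np.int64)),
    torch.from_numpy(data["input_mask"].astype(np.int64)),
    torch.from_numpy(data["start_positions"].astype(np.int64)),
    torch.from_numpy(data["end_positions"].astype(np.int64)))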
Example #2
def test(args):  # Load a trained model that you have fine-tuned and run evaluation (on GPU in this snippet)
    tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    
    eval_examples = data_utils.read_squad_examples(os.path.join(args.data_dir, "test.json"), is_training=False)

    eval_features = data_utils.convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=False)
    
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    
    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()
    all_results = []
    for step, batch in enumerate(eval_dataloader):
        example_indices = batch[-1]
        batch = tuple(t.cuda() for t in batch[:-1])
        input_ids, segment_ids, input_mask = batch
        
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(data_utils.RawResult(
                unique_id=unique_id, start_logits=start_logits, end_logits=end_logits))
    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
    data_utils.write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length,
                      True, output_prediction_file, output_nbest_file, False)
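A minimal sketch of the command-line wiring these snippets appear to expect; the flag names mirror the attributes read from args above, but the defaults and the entry point itself are assumptions rather than part of the original examples.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_model", type=str, default="bert-base-uncased")
    parser.add_argument("--input_dir", type=str, default="data")    # read by gen()
    parser.add_argument("--data_dir", type=str, default="data")     # read by test()/train()
    parser.add_argument("--output_dir", type=str, default="output")
    parser.add_argument("--max_seq_length", type=int, default=384)
    parser.add_argument("--doc_stride", type=int, default=128)
    parser.add_argument("--max_query_length", type=int, default=64)
    parser.add_argument("--eval_batch_size", type=int, default=8)
    parser.add_argument("--n_best_size", type=int, default=20)
    parser.add_argument("--max_answer_length", type=int, default=30)
    args = parser.parse_args()
    test(args)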
Example #3
def train(args):
    # Split the requested batch size into micro-batches; the optimizer steps once every gradient_accumulation_steps batches.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    tokenizer = BertTokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])

    train_examples = data_utils.read_squad_examples(os.path.join(
        args.data_dir, "train.json"),
                                                    is_training=True)

    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples,
        tokenizer,
        args.max_seq_length,
        args.doc_stride,
        args.max_query_length,
        is_training=True)
    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features],
                                     dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_start_positions, all_end_positions)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # >>>>> validation
    if args.do_valid:
        valid_examples = data_utils.read_squad_examples(os.path.join(
            args.data_dir, "dev.json"),
                                                        is_training=True)

        valid_features = data_utils.convert_examples_to_features(
            valid_examples,
            tokenizer,
            args.max_seq_length,
            args.doc_stride,
            args.max_query_length,
            is_training=True)
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_start_positions = torch.tensor(
            [f.start_position for f in valid_features], dtype=torch.long)
        valid_all_end_positions = torch.tensor(
            [f.end_position for f in valid_features], dtype=torch.long)

        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask,
                                   valid_all_start_positions,
                                   valid_all_end_positions)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []
    # <<<<< end of validation declaration
    if not args.bert_model.endswith(".pt"):
        model = BertForQuestionAnswering.from_pretrained(
            modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    else:
        model = torch.load(args.bert_model)

    if args.fp16:
        model.half()
    model.cuda()
    # Prepare optimizer
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    model.train()
    for _ in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        # >>>> perform validation at the end of each epoch
        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(t.cuda() for t in batch)  # multi-GPU does the scattering itself
                    input_ids, segment_ids, input_mask, start_positions, end_positions = batch
                    loss = model(input_ids, segment_ids, input_mask,
                                 start_positions, end_positions)
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)
            if valid_loss < best_valid_loss:
                torch.save(model, os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()
    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
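The snippets above appear to target the older pytorch-pretrained-bert package (the predecessor of transformers) plus two project-local modules, data_utils and modelconfig. A plausible import block, reconstructed from the names used in the code and therefore an assumption rather than part of the original examples:

import os
import json
import logging

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# From the pre-transformers pytorch-pretrained-bert package.
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

# Project-local helpers assumed by the snippets (not a published package).
import data_utils
import modelconfig

logger = logging.getLogger(__name__)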