Example #1
def evaluate(args, model):
    """ Train the model """
    dev_dataset = SequenceDataset(
        TextTokenIdsCache(args.preprocess_dir, f"{args.mode}-query"),
        args.max_seq_length)
    collate_fn = get_collate_function(args.max_seq_length)
    batch_size = args.pergpu_eval_batch_size
    if args.n_gpu > 1:
        batch_size *= args.n_gpu
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                collate_fn=collate_fn)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    qembedding_memmap = np.memmap(args.qmemmap_path,
                                  dtype="float32",
                                  shape=(len(dev_dataset), 768),
                                  mode="w+")
    with torch.no_grad():
        for step, (batch, qoffsets) in enumerate(tqdm(dev_dataloader)):
            batch = {k: v.to(args.model_device) for k, v in batch.items()}
            model.eval()
            embeddings = model(input_ids=batch["input_ids"],
                               attention_mask=batch["attention_mask"],
                               is_query=True)
            embeddings = embeddings.detach().cpu().numpy()
            qembedding_memmap[qoffsets] = embeddings
    return qembedding_memmap
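
The function returns the open memmap, but the embeddings also persist on disk at args.qmemmap_path. A minimal sketch of reading them back later, assuming the same float32 dtype, a 768-dimensional embedding width, and placeholder values standing in for args.qmemmap_path and len(dev_dataset):

import numpy as np

# Placeholder values; in practice these match args.qmemmap_path and len(dev_dataset).
qmemmap_path = "queries.memmap"
num_queries = 1000

# Re-open the matrix written by evaluate() without loading it into RAM.
query_embeddings = np.memmap(qmemmap_path,
                             dtype="float32",
                             mode="r",
                             shape=(num_queries, 768))
print(query_embeddings[0][:5])  # first few dimensions of the first query vector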
Example #2
def evaluate(args, model, tokenizer, prefix=""):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    if args.mask_method == "None":
        eval_dataset = TopNDataset(args.topN_file, tokenizer, "dev.small",
                                   args.msmarco_dir,
                                   args.collection_memmap_dir,
                                   args.tokenize_dir, args.max_query_length,
                                   args.max_seq_length)
        collate_func = origin_dataset.get_collate_function()
    else:
        eval_dataset = RelevantDataset(tokenizer, "dev.small",
                                       args.msmarco_dir,
                                       args.collection_memmap_dir,
                                       args.tokenize_dir,
                                       args.max_query_length,
                                       args.max_seq_length)
        collate_func = adverse_dataset.get_collate_function(
            tokenizer, args.mask_method)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate_func)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    cnt = 0
    with open(args.output_score_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader):
            model.eval()
            batch = {k: v.to(args.device) for k, v in batch.items()}
            if args.mask_method == "attention_mask":
                batch['attention_mask_after_softmax_layer_set'] = list(
                    range(model.config.num_hidden_layers))
            with torch.no_grad():
                outputs = model(**batch)
                scores = outputs[0].detach().cpu().numpy()
                for qid, pid, score in zip(qids, pids, scores[:, 1]):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")
            cnt += 1
Example #3
def evaluate(args, model, mode, prefix):
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    eval_dataset = MSMARCODataset(mode, args.msmarco_dir,
                                  args.collection_memmap_dir,
                                  args.tokenize_dir, args.max_query_length,
                                  args.max_doc_length)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, docids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                outputs = model(**batch)
                scores = torch.diagonal(outputs[0]).detach().cpu().numpy()
                assert len(qids) == len(docids) == len(scores)
                for qid, docid, score in zip(qids, docids, scores):
                    outputfile.write(f"{qid}\t{docid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev":
        mrr = eval_results(rank_output)
        return mrr
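
generate_rank and eval_results are project helpers that are not shown on this page. A rough sketch of what the ranking step could look like, purely as an assumption about its behavior (group the score TSV by query id, sort each group by score in descending order, write out ranks):

from collections import defaultdict

def generate_rank_sketch(score_path, rank_path):
    # Hypothetical stand-in for generate_rank(): turn qid\tdocid\tscore lines
    # into qid\tdocid\trank lines, ranking documents per query by score.
    per_query = defaultdict(list)
    with open(score_path) as f:
        for line in f:
            qid, docid, score = line.rstrip("\n").split("\t")
            per_query[qid].append((docid, float(score)))
    with open(rank_path, "w") as out:
        for qid, docs in per_query.items():
            docs.sort(key=lambda x: x[1], reverse=True)
            for rank, (docid, _) in enumerate(docs, start=1):
                out.write(f"{qid}\t{docid}\t{rank}\n")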
Example #4
def evaluate(args, model, mode, prefix, eval_dataset=None):
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    if eval_dataset is None:
        eval_dataset = CLEARDataset(mode=mode, args=args)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 pin_memory=True,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                scores = model(**batch)
                assert len(qids) == len(pids) == len(scores)
                for qid, pid, score in zip(qids, pids, scores):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev.small":
        mrr = eval_results(rank_output) * 6980 / args.num_eval_queries
        return mrr
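
The factor 6980 / args.num_eval_queries appears to rescale the MRR when only a subset of the 6,980 MS MARCO dev.small queries is evaluated: if eval_results averages reciprocal ranks over all 6,980 queries and counts missing queries as zero, the multiplication recovers the mean over just the evaluated subset. A small arithmetic illustration of that reading, with hypothetical numbers:

# Assumption: eval_results() divides the summed reciprocal ranks by 6980.
sum_reciprocal_ranks = 250.0
num_eval_queries = 1000
mrr_over_full_set = sum_reciprocal_ranks / 6980
mrr_over_subset = mrr_over_full_set * 6980 / num_eval_queries  # == sum_reciprocal_ranks / 1000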
Example #5
def evaluate(args, model, tokenizer, prefix=""):
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    for key in args.keys:
        key_dir = f"{args.output_dir}/{key}"
        for layer_idx in range(model.config.num_hidden_layers + 1):
            layer_dir = f"{key_dir}/{layer_idx}"
            if not os.path.exists(layer_dir):
                os.makedirs(layer_dir)

    stop_words_set = load_stopwords(args.idf_path)

    eval_dataset = TopNDataset(args.rank_file, tokenizer, args.mode,
                               args.msmarco_dir, args.collection_memmap_dir,
                               args.tokenize_dir, args.max_query_length,
                               args.max_seq_length)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function())

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d", len(eval_dataset))
    print("  Batch size = %d", args.eval_batch_size)
    for batch, qids, pids in tqdm(eval_dataloader):
        model.eval()
        batch = {k: v.to(args.device) for k, v in batch.items()}
        with torch.no_grad():
            all_layers_hidden_states = model(**batch)[1]
            all_layers_hidden_states = [
                h.detach().cpu().numpy() for h in all_layers_hidden_states
            ]
            save_to_disk(tokenizer, stop_words_set, all_layers_hidden_states,
                         args, qids, pids, batch)
Example #6
def train(args, model):
    """ Train the model """
    tb_writer = SummaryWriter(args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_dataset = MSMARCODataset("train", args.msmarco_dir,
                                   args.collection_memmap_dir,
                                   args.tokenize_dir, args.max_query_length,
                                   args.max_doc_length)

    # NOTE: Must Sequential! Pos, Neg, Pos, Neg, ...
    train_sampler = SequentialSampler(train_dataset)
    collate_fn = get_collate_function(mode="train")
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=args.data_num_workers,
                                  collate_fn=collate_fn)

    t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
               args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch_idx, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, (batch, _, _) in enumerate(epoch_iterator):

            batch = {k: v.to(args.device) for k, v in batch.items()}
            model.train()
            outputs = model(**batch)
            # model outputs are always a tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel (not distributed) training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.evaluate_during_training and (
                        global_step % args.training_eval_steps == 0):
                    mrr = evaluate(args,
                                   model,
                                   mode="dev",
                                   prefix="step_{}".format(global_step))
                    tb_writer.add_scalar('dev/MRR@10', mrr, global_step)
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    cur_loss = (tr_loss - logging_loss) / args.logging_steps
                    tb_writer.add_scalar('train/loss', cur_loss, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    save_model(model, args.model_save_dir,
                               'ckpt-{}'.format(global_step), args)
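
set_seed is another helper that is not shown in the snippet; a minimal sketch of what such a function typically does in pytorch-transformers-style training scripts (an assumption, not this project's exact code):

import random

import numpy as np
import torch

def set_seed(args):
    # Seed every RNG the training loop touches so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)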