def evaluate(args, model):
    """Encode every query of the `{args.mode}` split and persist the embeddings.

    Writes one 768-dim float32 vector per query into a memmap at
    ``args.qmemmap_path``, indexed by the offsets yielded by the dataloader.

    Args:
        args: namespace carrying preprocess_dir, mode, max_seq_length,
            pergpu_eval_batch_size, n_gpu, qmemmap_path, model_device.
        model: encoder whose forward accepts input_ids / attention_mask /
            is_query and returns query embeddings.

    Returns:
        The ``(len(dataset), 768)`` np.memmap of query embeddings.
    """
    # NOTE: original docstring said "Train the model" — this function only
    # runs inference; corrected.
    dev_dataset = SequenceDataset(
        TextTokenIdsCache(args.preprocess_dir, f"{args.mode}-query"),
        args.max_seq_length)
    collate_fn = get_collate_function(args.max_seq_length)
    batch_size = args.pergpu_eval_batch_size
    if args.n_gpu > 1:
        batch_size *= args.n_gpu
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                collate_fn=collate_fn)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    qembedding_memmap = np.memmap(args.qmemmap_path,
                                  dtype="float32",
                                  shape=(len(dev_dataset), 768),
                                  mode="w+")
    # Switch to inference mode once, outside the loop (was per-batch).
    model.eval()
    with torch.no_grad():
        for step, (batch, qoffsets) in enumerate(tqdm(dev_dataloader)):
            batch = {k: v.to(args.model_device) for k, v in batch.items()}
            embeddings = model(input_ids=batch["input_ids"],
                               attention_mask=batch["attention_mask"],
                               is_query=True)
            # qoffsets are the global row indices for this batch's queries.
            qembedding_memmap[qoffsets] = embeddings.detach().cpu().numpy()
    # Make sure buffered writes reach disk before handing the memmap back.
    qembedding_memmap.flush()
    return qembedding_memmap
def evaluate(args, model, tokenizer, prefix=""):
    """Score (query, passage) pairs and write ``qid\\tpid\\tscore`` rows.

    Depending on ``args.mask_method``, either re-ranks a top-N file
    ("None") or scores relevant pairs under an adversarial masking scheme.

    Args:
        args: namespace with output_dir, mask_method, dataset paths, length
            limits, batch sizes, n_gpu, device, output_score_path.
        model: sequence-pair classifier; ``outputs[0][:, 1]`` is used as the
            relevance score.
        tokenizer: tokenizer handed to the dataset / collate function.
        prefix: unused here; kept for signature compatibility with callers.
    """
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    if args.mask_method == "None":
        eval_dataset = TopNDataset(args.topN_file, tokenizer, "dev.small",
                                   args.msmarco_dir,
                                   args.collection_memmap_dir,
                                   args.tokenize_dir, args.max_query_length,
                                   args.max_seq_length)
        collate_func = origin_dataset.get_collate_function()
    else:
        eval_dataset = RelevantDataset(tokenizer, "dev.small",
                                       args.msmarco_dir,
                                       args.collection_memmap_dir,
                                       args.tokenize_dir,
                                       args.max_query_length,
                                       args.max_seq_length)
        collate_func = adverse_dataset.get_collate_function(
            tokenizer, args.mask_method)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate_func)
    # BUG FIX: read the config BEFORE a possible DataParallel wrap —
    # torch.nn.DataParallel hides wrapped attributes behind `.module`,
    # so `model.config` would raise AttributeError when n_gpu > 1.
    num_hidden_layers = model.config.num_hidden_layers
    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    cnt = 0
    # Inference mode once, outside the loop (was per-batch).
    model.eval()
    with open(args.output_score_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader):
            batch = {k: v.to(args.device) for k, v in batch.items()}
            if args.mask_method == "attention_mask":
                # Ablate the post-softmax attention mask on every layer.
                batch['attention_mask_after_softmax_layer_set'] = list(
                    range(num_hidden_layers))
            with torch.no_grad():
                outputs = model(**batch)
            # Column 1 of the logits is the "relevant" class score.
            scores = outputs[0].detach().cpu().numpy()
            for qid, pid, score in zip(qids, pids, scores[:, 1]):
                outputfile.write(f"{qid}\t{pid}\t{score}\n")
                cnt += 1
def evaluate(args, model, mode, prefix):
    """Score the given MSMARCO split, rank the scores, and report MRR on dev.

    Writes ``{prefix}.{mode}.score.tsv`` (qid, docid, score) and the derived
    ``{prefix}.{mode}.rank.tsv`` under ``args.eval_save_dir``.

    Args:
        args: namespace with eval_save_dir, dataset paths, length limits,
            per_gpu_eval_batch_size, n_gpu, data_num_workers, device.
        model: pair scorer; diagonal of ``outputs[0]`` gives per-pair scores.
        mode: dataset split name, e.g. "dev".
        prefix: tag used in the output file names.

    Returns:
        MRR@10 (float) when ``mode == "dev"``, otherwise ``None``.
    """
    save_dir = args.eval_save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    eval_dataset = MSMARCODataset(mode, args.msmarco_dir,
                                  args.collection_memmap_dir,
                                  args.tokenize_dir, args.max_query_length,
                                  args.max_doc_length)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 collate_fn=get_collate_function(mode=mode))

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    output_file_path = f"{save_dir}/{prefix}.{mode}.score.tsv"
    model.eval()
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, docids in tqdm(eval_dataloader, desc="Evaluating"):
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                outputs = model(**batch)
                # Pair i's score sits at [i, i] of the score matrix.
                scores = torch.diagonal(outputs[0]).detach().cpu().numpy()
                assert len(qids) == len(docids) == len(scores)
                rows = (f"{q}\t{d}\t{s}\n"
                        for q, d, s in zip(qids, docids, scores))
                outputfile.writelines(rows)

    rank_output = f"{save_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev":
        return eval_results(rank_output)
def evaluate(args, model, mode, prefix, eval_dataset=None):
    """Score a CLEAR split, rank the scores, and report scaled MRR on dev.small.

    Writes ``{prefix}.{mode}.score.tsv`` (qid, pid, score) and the derived
    ``{prefix}.{mode}.rank.tsv`` under ``args.eval_save_dir``.

    Args:
        args: namespace with eval_save_dir, per_gpu_eval_batch_size, n_gpu,
            data_num_workers, device, num_eval_queries.
        model: scorer returning one score per (query, passage) pair.
        mode: split name, e.g. "dev.small".
        prefix: tag used in the output file names.
        eval_dataset: optional pre-built dataset; constructed when None.

    Returns:
        Scaled MRR (float) when ``mode == "dev.small"``, otherwise ``None``.
    """
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    # BUG FIX: compare with `is None`, not `== None` (PEP 8; `==` may be
    # overridden by the dataset class).
    if eval_dataset is None:
        eval_dataset = CLEARDataset(mode=mode, args=args)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 pin_memory=True,
                                 collate_fn=collate_fn)
    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    # Inference mode once, outside the loop (was per-batch).
    model.eval()
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader, desc="Evaluating"):
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                scores = model(**batch)
                assert len(qids) == len(pids) == len(scores)
                for qid, pid, score in zip(qids, pids, scores):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")
    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)
    if mode == "dev.small":
        # 6980 = full dev.small query count; rescale when evaluating on a
        # subsample of args.num_eval_queries queries.
        mrr = eval_results(rank_output) * 6980 / args.num_eval_queries
        return mrr
def evaluate(args, model, tokenizer, prefix=""):
    """Dump per-layer hidden states for top-N (query, passage) pairs to disk.

    Creates one directory per key in ``args.keys`` and per transformer layer
    (embedding layer included, hence ``num_hidden_layers + 1``), then streams
    batches through the model and hands each batch's hidden states to
    ``save_to_disk``.

    Args:
        args: namespace with output_dir, keys, idf_path, rank_file, mode,
            dataset paths, length limits, batch sizes, n_gpu, device.
        model: transformer whose forward returns all hidden states at index 1.
        tokenizer: tokenizer passed through to the dataset and to save_to_disk.
        prefix: tag printed in the banner line only.
    """
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    for key in args.keys:
        key_dir = f"{args.output_dir}/{key}"
        for layer_idx in range(model.config.num_hidden_layers + 1):
            layer_dir = f"{key_dir}/{layer_idx}"
            if not os.path.exists(layer_dir):
                os.makedirs(layer_dir)
    stop_words_set = load_stopwords(args.idf_path)
    eval_dataset = TopNDataset(args.rank_file, tokenizer, args.mode,
                               args.msmarco_dir, args.collection_memmap_dir,
                               args.tokenize_dir, args.max_query_length,
                               args.max_seq_length)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function())
    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    # BUG FIX: print() does not lazily %-format arguments the way
    # logger.info() does — the original printed the literal "%d" followed by
    # the value. Interpolate explicitly instead.
    print(" Num examples = %d" % len(eval_dataset))
    print(" Batch size = %d" % args.eval_batch_size)
    # Inference mode once, outside the loop (was per-batch).
    model.eval()
    for batch, qids, pids in tqdm(eval_dataloader):
        batch = {k: v.to(args.device) for k, v in batch.items()}
        with torch.no_grad():
            # Index 1 of the model outputs holds the tuple of hidden states,
            # one tensor per layer.
            all_layers_hidden_states = model(**batch)[1]
            all_layers_hidden_states = [
                h.detach().cpu().numpy() for h in all_layers_hidden_states
            ]
            save_to_disk(tokenizer, stop_words_set, all_layers_hidden_states,
                         args, qids, pids, batch)
def train(args, model):
    """ Train the model """
    # TensorBoard writer for loss / lr / dev-MRR curves.
    tb_writer = SummaryWriter(args.log_dir)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = MSMARCODataset("train", args.msmarco_dir,
                                   args.collection_memmap_dir,
                                   args.tokenize_dir, args.max_query_length,
                                   args.max_doc_length)
    # NOTE: Must Sequential! Pos, Neg, Pos, Neg, ...
    # (the dataset interleaves positive/negative examples, so shuffling
    # would break the pairing)
    train_sampler = SequentialSampler(train_dataset)
    collate_fn = get_collate_function(mode="train")
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=args.data_num_workers,
                                  collate_fn=collate_fn)
    # Total number of optimizer steps across all epochs (accumulation-aware).
    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and LayerNorm weights are conventionally excluded from weight
    # decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for epoch_idx, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, (batch, _, _) in enumerate(epoch_iterator):
            batch = {k: v.to(args.device) for k, v in batch.items()}
            model.train()
            outputs = model(**batch)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                # Scale the loss so accumulated gradients average, not sum.
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            tr_loss += loss.item()
            # Only step the optimizer once every accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.evaluate_during_training and (
                        global_step % args.training_eval_steps == 0):
                    mrr = evaluate(args,
                                   model,
                                   mode="dev",
                                   prefix="step_{}".format(global_step))
                    tb_writer.add_scalar('dev/MRR@10', mrr, global_step)
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    # Average loss since the last logging point.
                    cur_loss = (tr_loss - logging_loss) / args.logging_steps
                    tb_writer.add_scalar('train/loss', cur_loss, global_step)
                    logging_loss = tr_loss
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    save_model(model, args.model_save_dir,
                               'ckpt-{}'.format(global_step), args)