def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) result = {"perplexity": perplexity} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
def evaluate(args, corrects, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    def get_mask_idx(batch):
        """Return the index of the mask token in every row of the batch."""
        mask_token = tokenizer.mask_token_id
        return [list(batch[i]).index(mask_token) for i in range(batch.shape[0])]

    def compute_ranked_accuracy(query2answers):
        accurate = 0
        total = 0
        answers, batches = query2answers
        for batch in tqdm(batches, desc="Evaluating"):
            batch = torch.tensor(batch).to(torch.int64)
            batch = batch.to(args.device)
            prediction_scores = model(batch)[0]
            masked_indices = get_mask_idx(batch)
            # Keep only the scores at each row's masked position
            prediction_scores = prediction_scores[np.arange(prediction_scores.shape[0]), masked_indices, :]
            for i, (prediction, sample) in enumerate(zip(prediction_scores, batch)):
                key = " ".join(tokenizer.convert_ids_to_tokens(sample[1:masked_indices[i]]))
                correct_objects = answers[key]
                numb_correct_answers = len(correct_objects)
                predicted_ids = torch.argsort(prediction, dim=0, descending=True)[:numb_correct_answers]
                ranked_predictions = tokenizer.convert_ids_to_tokens(predicted_ids)
                accurate += len(set(ranked_predictions) & set(correct_objects)) / numb_correct_answers
                total += 1.0
        return accurate / total

    model.eval()
    result = {}
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Batch size = %d", args.batch_size)
    for eval_type, query2answers in corrects.items():
        with torch.no_grad():
            accuracy = compute_ranked_accuracy(query2answers)
        accuracy = round(accuracy, 4)
        result[eval_type + '_ranked_acc'] = accuracy

    logger.info("***** Eval results {} *****".format(prefix))
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    return result
def evaluate_model(model: PreTrainedModel, loader: DataLoader, device: torch.device) -> float:
    model.eval()
    total_loss = 0
    with torch.no_grad():  # the original omitted this; autograd is not needed during evaluation
        for i, batch in enumerate(loader):
            for k, v in batch.items():
                batch[k] = v.to(device)
            outputs = model(**batch)
            loss = outputs[0]
            total_loss += loss.item()
    # Note: this divides the sum of per-batch mean losses by the number of
    # *examples*, not the number of batches.
    return total_loss / len(loader.dataset)
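# Hypothetical usage of evaluate_model above; the `model` and `eval_dataloader` names are
# illustrative only. Note that it expects batches as dicts of tensors, e.g. as produced by
# a tokenizer called with return_tensors="pt".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mean_loss_per_example = evaluate_model(model, eval_dataloader, device)
print(f"eval loss (per example): {mean_loss_per_example:.4f}")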
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix='') -> Dict:
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, args.eval_batch_size, sampler=eval_sampler,
                                 collate_fn=collate, num_workers=args.num_workers)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(f'  Num examples = {len(eval_dataset)}')
    logger.info(f'  Batch size = {args.eval_batch_size}')
    eval_loss = 0.0
    model.eval()

    for step, batch in enumerate(tqdm(eval_dataloader, desc='Evaluating')):
        inputs, labels = mask_tokens(batch, tokenizer, args) \
            if args.mlm else (batch, batch)
        inputs, labels = inputs.to(args.device), labels.to(args.device)
        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) \
                if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]
            eval_loss += loss.mean().item()

    eval_loss = eval_loss / (step + 1)
    perplexity = torch.exp(torch.tensor(eval_loss))
    result = {"perplexity": perplexity}
    return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) args.eval_batch_size = args.per_gpu_eval_batch_size # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) # If some of the input is padded, then the attention mask is needed attention_mask = (inputs != tokenizer.pad_token_id) # word_tokens --> 1, pad_token --> 0 if attention_mask.all(): attention_mask = None with torch.no_grad(): outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)).item() result = {"perplexity": perplexity} return result
def tsne(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    tsne = TSNE()
    if args.local_rank in [-1, 0]:
        # The original referenced an undefined `eval_output_dir`; args.output_dir is assumed.
        os.makedirs(args.output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples):
        if tokenizer._pad_token is None:
            return (pad_sequence([d[0] for d in examples], batch_first=True),
                    torch.tensor([d[1] for d in examples]))
        return (pad_sequence([d[0] for d in examples], batch_first=True,
                             padding_value=tokenizer.pad_token_id),
                torch.tensor([d[1] for d in examples]))

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    model.eval()
    X = None
    y = None
    for batch in tqdm(train_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch[0], tokenizer, args) if args.mlm else (batch[0], batch[0])
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        domain_labels = batch[1].to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels, domain_labels=domain_labels) \
                if args.mlm else model(inputs, labels=labels, domain_labels=domain_labels)
            # Accumulate the hidden representations (outputs[1]) and domain labels.
            # The original called .item() here, which only works for scalar tensors;
            # detached CPU tensors are needed for torch.cat and for TSNE below.
            features = outputs[1].detach().cpu()
            domains = domain_labels.detach().cpu()
            if X is not None:
                X = torch.cat((X, features), 0)
                y = torch.cat((y, domains), 0)
            else:
                X = features
                y = domains

    X_embedded = tsne.fit_transform(X.numpy())
    # `palette` is assumed to be defined at module scope (e.g. a seaborn color palette)
    sns.scatterplot(X_embedded[:, 0], X_embedded[:, 1], hue=y.numpy(), legend='full', palette=palette)
def __test(self, model: PreTrainedModel, data: DataLoader) -> (float, float, float, float, float, str):
    eval_loss = 0.
    eval_steps, eval_examples = 0, 0
    tokens, eval_predictions, eval_labels = [], [], []
    model.eval()
    for batch in tqdm(data):
        batch_tokens, batch_masks, batch_tags = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            outputs = model(batch_tokens, attention_mask=batch_masks, labels=batch_tags)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_tags.to('cpu').numpy()
        toks = batch_tokens.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        batch_toks = [self.tokenizer.convert_ids_to_tokens(sentence) for sentence in toks]
        tokens.extend(batch_toks)
        eval_predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        eval_labels.extend(label_ids)
        eval_examples += batch_tokens.size(0)
        eval_steps += 1
    eval_loss = eval_loss / eval_steps
    predicted_tags, valid_tags, tokens = self.translate(eval_predictions, eval_labels, tokens)
    score_acc = accuracy_score(valid_tags, predicted_tags)
    score_f1 = f1_score(valid_tags, predicted_tags)
    score_p = precision_score(valid_tags, predicted_tags)
    score_r = recall_score(valid_tags, predicted_tags)
    report = classification_report(valid_tags, predicted_tags)
    return eval_loss, score_acc, score_f1, score_p, score_r, report
def predict_task_split(self, model: transformers.PreTrainedModel, inputs: tf.data.Dataset, task: Task,
                       max_length: int = 140, min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:
    try:
        outputs = []
        model.to(self.device)
        for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                      desc="Predicting %s" % task,
                                      unit="batch", leave=False):
            with torch.no_grad():
                model.eval()
                forward_params = self.prepare_forward_inputs(model, batch_inputs)
                batch_outputs = model.generate(forward_params['input_ids'],
                                               attention_mask=forward_params['attention_mask'],
                                               do_sample=False,
                                               max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                                               min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                                               num_beams=4,
                                               length_penalty=2.,
                                               no_repeat_ngram_size=3,
                                               early_stopping=True)
                batch_outputs = batch_outputs.detach().cpu().numpy()
                outputs.extend(batch_outputs)
        return outputs
    # We can't just except tf.errors.UnknownError, because it is thrown as some sort of weird proxy
    # instance of a tf.errors.UnknownError and python's pattern matching can't handle the scandal
    except Exception as e:
        if isinstance(e, tf.errors.UnknownError):
            logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
            # Unfortunately, we don't get a more helpful error type, but this usually means
            # that the dataset has no labels for a given split (e.g., test evaluation occurs on a server)
            return []
        else:
            # We got a different exception type so let python freak out accordingly
            logging.error('Encountered error: %s on %s: %s', type(e), task, e)
            raise e
def convert_to_onnx(model: PreTrainedModel, tokenizer, output_path, opset: int = 12):
    onnx_output_path = os.path.join(output_path, 'checkpoint_without_optimize.onnx')
    onnx_optimized_output_path = os.path.join(output_path, 'checkpoint_with_optimize.onnx')
    onnx_optimized_fp16_output_path = os.path.join(output_path, 'checkpoint_with_optimize_fp16.onnx')

    model.eval()
    with torch.no_grad():
        # The original referenced undefined `tmp_model` / `tmp_tokenizer` names; the `model`
        # argument and a new `tokenizer` parameter are assumed to be what was intended.
        input_names, output_names, dynamic_axes, tokens = infer_shapes(model, tokenizer)
        ordered_input_names, model_args = ensure_valid_input(model, tokens, input_names)
        print(f"Model input names: {ordered_input_names}.")
        export(model, model_args, onnx_output_path,
               input_names=ordered_input_names,
               output_names=output_names,
               dynamic_axes=dynamic_axes,
               verbose=True,
               opset_version=opset)
        print(f"Finished output checkpoint_without_optimize.onnx to {output_path}.")

    optimized_model = optimizer.optimize_model(onnx_output_path, model_type='bert',
                                               num_heads=12, hidden_size=768, use_gpu=True)
    optimized_model.save_model_to_file(onnx_optimized_output_path)
    print(f"Finished output checkpoint_with_optimize.onnx to {output_path}.")

    optimized_model.convert_model_float32_to_float16()
    optimized_model.save_model_to_file(onnx_optimized_fp16_output_path)
    print(f"Finished output checkpoint_with_optimize_fp16.onnx to {output_path}.")
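# A minimal sanity check of an exported graph with ONNX Runtime. This is a sketch, not part of
# the original code: the feed names ("input_ids", "attention_mask") depend on how the model was
# exported, and `onnx_path` and the dummy shape are illustrative.
import numpy as np
import onnxruntime as ort

def check_onnx_model(onnx_path: str):
    session = ort.InferenceSession(onnx_path)
    dummy_feed = {
        "input_ids": np.ones((1, 8), dtype=np.int64),
        "attention_mask": np.ones((1, 8), dtype=np.int64),
    }
    outputs = session.run(None, dummy_feed)  # None = fetch all model outputs
    print([o.shape for o in outputs])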
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) evaluation_loss = dict() eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) # adjusting eval batch size according to the number of train epochs to # make it easier to plot with same length for train and eval also for # the eval loss plot to be adjusted and clear within the plot frame # args.eval_batch_size = int(len(eval_dataset) / args.num_train_epochs) # commenting the actual one args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) \ if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) \ if args.mlm else model(inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 # write for each batch eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) print('\n--------------------------') result = { "perplexity": perplexity, "eval_loss": eval_loss, "eval_steps": nb_eval_steps } output_eval_file = os.path.join(FINETUNE_DIR, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) print('----------------------------') return result
def extract_feature(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Extracting features {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    model.eval()

    sample_size = 0
    summations = []
    # First pass: accumulate per-layer sums to compute the mean
    for batch in tqdm(eval_dataloader, desc="Calculate Mean"):
        with torch.no_grad():
            batch = batch.to(args.device)
            outputs = model(batch)
            logits, hidden_states = outputs
            tmp_hidden_states = []
            if args.target.lower() == "cls":
                for i, state in enumerate(hidden_states):
                    tmp_hidden_states.append(state[:, 0, :])
            elif args.target == "words":
                for i, state in enumerate(hidden_states):
                    state = state[:, 1:, :]
                    mask = (batch[:, 1:] != 0).unsqueeze(2)
                    state = mask * state
                    state = torch.sum(state, dim=1) / torch.sum(mask, dim=1)
                    tmp_hidden_states.append(state)
            elif args.target == "all":
                for i, state in enumerate(hidden_states):
                    mask = (batch != 0).unsqueeze(2)
                    state = mask * state
                    state = torch.sum(state, dim=1) / torch.sum(mask, dim=1)
                    tmp_hidden_states.append(state)
            else:
                raise NotImplementedError()
            hidden_states = tmp_hidden_states
            assert hidden_states[0].dim() == 2
            if len(summations) == 0:
                for state in hidden_states:
                    summations.append(torch.sum(state, dim=0))
                assert len(summations) == len(hidden_states)
            else:
                for i, state in enumerate(hidden_states):
                    summations[i] += torch.sum(state, dim=0)
            sample_size += len(hidden_states[0])
        if sample_size >= args.max_sample_size:
            break

    # assert sample_size == len(eval_dataset)
    mean = [s / sample_size for s in summations]

    # A second pass computing the per-layer (statistic) variance was disabled in the original
    # source. It mirrored the mean computation above, accumulating sum((state - mean) ** 2)
    # per layer, dividing by sample_size (variance) and sample_size - 1 (statistic variance),
    # and saving both alongside the mean in the pickle below.

    output_file = os.path.join(eval_output_dir, f"{prefix}.pkl")
    with open(output_file, "wb") as fout:
        logger.info(f"***** Saving features {prefix}.pkl *****")
        mean = np.array([m.to('cpu').numpy() for m in mean])
        pickle.dump({'mean': mean}, fout)
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: eval_output_dir = args.output_dir os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.dev_tsv is not None: from cs272_project.dataset.tsv_dataset import TSVDataset eval_dataset = TSVDataset(tokenizer, tsv_file=args.dev_tsv, batch_size=args.eval_batch_size, block_size=args.block_size) else: eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, batch_size=args.eval_batch_size) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=None) eval_lm_loss = 0.0 eval_mc_loss = 0.0 nb_lm_eval_steps = 0 nb_mc_eval_steps = 0 model.eval() eval_iters = tqdm(eval_dataloader, desc="Evaluating", dynamic_ncols=True) for batch_lm, mc_labels in eval_iters: inputs, lm_labels = batch_lm, batch_lm inputs = inputs.to(args.device) lm_labels = lm_labels.to(args.device) mc_labels = mc_labels.to(args.device) with torch.no_grad(): outputs = model(inputs, lm_labels=lm_labels, mc_labels=mc_labels) lm_loss = torch.where(mc_labels == 1, outputs[0], torch.zeros_like(outputs[0])) mc_loss = outputs[1] eval_mc_loss += mc_loss.mean().item() nb_mc_eval_steps += 1 if lm_loss.mean().item() != 0.0: eval_lm_loss += lm_loss.mean().item() nb_lm_eval_steps += 1 if nb_lm_eval_steps == 0: mean_lm_loss = 0 else: mean_lm_loss = eval_lm_loss / nb_lm_eval_steps mean_mc_loss = eval_mc_loss / nb_mc_eval_steps ppl = 2**mean_lm_loss result = { "perplexity": ppl, "lm_loss": mean_lm_loss, "mc_loss": mean_mc_loss } output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") eval_stat = f"(eval) lm_loss: {result['lm_loss']:.3f}" \ f" mc_loss: {result['mc_loss']:.3f}" \ f" ppl: {result['perplexity']:.3f}" eval_iters.set_description(eval_stat) with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) return result
def export_pytorch(
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch model to an ONNX Intermediate Representation (IR).

    Args:
        tokenizer ([`PreTrainedTokenizer`]): The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]): The model to export.
        config ([`~onnx.config.OnnxConfig`]): The ONNX configuration associated with the exported model.
        opset (`int`): The version of the ONNX operator set to use.
        output (`Path`): Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named
        inputs from the ONNX configuration.
    """
    if issubclass(type(model), PreTrainedModel):
        import torch
        from torch.onnx import export as onnx_export

        logger.info(f"Using framework PyTorch: {torch.__version__}")
        with torch.no_grad():
            model.config.return_dict = True
            model.eval()

            # Check if we need to override certain configuration items
            if config.values_override is not None:
                logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
                for override_config_key, override_config_value in config.values_override.items():
                    logger.info(f"\t- {override_config_key} -> {override_config_value}")
                    setattr(model.config, override_config_key, override_config_value)

            # Ensure inputs match
            # TODO: Check when exporting QA we provide "is_pair=True"
            model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
            inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
            onnx_outputs = list(config.outputs.keys())

            if not inputs_match:
                raise ValueError("Model and config inputs don't match")

            config.patch_ops()

            # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
            # so we check the torch version for backwards compatibility
            if parse(torch.__version__) <= parse("1.10.99"):
                # export can work with named args but the dict containing named args
                # has to be the last element of the args tuple.
                onnx_export(
                    model,
                    (model_inputs,),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
                    do_constant_folding=True,
                    use_external_data_format=config.use_external_data_format(model.num_parameters()),
                    enable_onnx_checker=True,
                    opset_version=opset,
                )
            else:
                onnx_export(
                    model,
                    (model_inputs,),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
                    do_constant_folding=True,
                    opset_version=opset,
                )

            config.restore_ops()

    return matched_inputs, onnx_outputs
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) seqs, masks, genres = zip(*examples) token_ids, fact_embedding_ids = zip(*[ get_inputs(seq, mask, tokenizer) for seq, mask, genre in examples ]) labels = [get_labels(mask) for seq, mask, genre in examples] pad_seqs = pad_sequence(token_ids, batch_first=True, padding_value=tokenizer.pad_token_id) pad_factsembeds = pad_sequence(fact_embedding_ids, batch_first=True, padding_value=FACT_EMBEDS_PAD) pad_labels = pad_sequence(labels, batch_first=True, padding_value=DIST_LABELS_PAD) torch.stack(genres) return list(zip(pad_seqs, pad_factsembeds, pad_labels, genres)) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): if args.mlm: inputs, labels = mask_tokens(batch, tokenizer, args) with torch.no_grad(): outputs = model( inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() elif args.xlnet: with torch.no_grad(): seqs, factsembs, labels, genres = zip(*batch) tlabels = torch.stack(labels).to(args.device) tseqs = torch.stack(seqs).to(args.device) tgenres = torch.stack(genres).to(args.device) padding_masks = torch.where(tseqs == tokenizer.pad_token_id, torch.ones_like(tlabels), torch.zeros_like(tlabels)).to( args.device) outputs = model(tseqs, genre_idxs=tgenres, input_mask=padding_masks, labels=tlabels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() else: inputs, labels = (batch, batch) with torch.no_grad(): outputs = model( inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps print(f"validation loss value at step is {eval_loss}") logger.info(f"validation loss value at step is {eval_loss}") perplexity = torch.exp(torch.tensor(eval_loss)) result = {"evalloss": eval_loss} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
def evaluate(args, data_generator, tb_writer, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
             global_step, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    criterion = nn.BCEWithLogitsLoss()
    eval_dataset = data_generator.instance_a_valid_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(batch):
        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        targets = [b[2] for b in batch]
        inputs = [b[3] for b in batch]
        lens = [len(x) for x in inputs]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        tokens, features, targets = [torch.tensor(x) for x in [tokens, features, targets]]
        return tokens, features, targets, inputs, attention_mask, torch.tensor(lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset, bucket_boundaries,
                                               batch_size=args.eval_batch_size, drop_last=False)
        eval_dataloader = DataLoader(eval_dataset, batch_size=1,
                                     batch_sampler=eval_sampler, collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    preds, labels = [], []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        tokens, features, targets, inputs, attention_mask, lens = batch
        tokens, features, targets, inputs, attention_mask, lens = [
            x.to(args.device) for x in [tokens, features, targets, inputs, attention_mask, lens]
        ]
        tokens, features, targets = [x.float() for x in [tokens, features, targets]]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            loss = criterion(logit, targets)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            labels.append(targets.long().detach().cpu().numpy())
            preds.append(pred)
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    labels = np.vstack(labels)
    preds = np.float64(np.vstack(preds))
    aucprs = []
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        _prauc = compute_prauc(preds[:, i], labels[:, i])
        _rce = compute_rce(preds[:, i], labels[:, i])
        aucprs.append(_prauc)
        print(engage + ":", _prauc, _rce)
        tb_writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc, global_step)
        tb_writer.add_scalar('RCE/{}_val'.format(engage), _rce, global_step)
    print("Mean AUCPR : {}".format(sum(aucprs) / 4.0))
    tb_writer.add_scalar('PRAUC/mean', sum(aucprs) / 4.0, global_step)
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() labels_file = str(args.eval_data_file).replace('masked_code_', 'mask_') labels_lines = [line.rstrip() for line in open(labels_file)] step = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): # Get the labels lines to process start = step * len(batch) end = start + len(batch) + 1 lables_to_process = labels_lines[start:end] step += 1 inputs, labels = read_masked_dataset(tokenizer, batch, lables_to_process) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) perfect_predictions, num_examples = get_number_perfect_predictions( model, tokenizer, args.eval_data_file) result = { "perplexity": perplexity, "loss": eval_loss, "perfect_predictions": perfect_predictions, "total_eval_examples": num_examples } wandb.log({'val_perplexity': perplexity, 'avg_val_loss': eval_loss}) wandb.log({'perfect_predictions': perfect_predictions}) wandb.log( {'perfect_predictions_percentage': perfect_predictions / num_examples}) output_eval_file = os.path.join( eval_output_dir, prefix, "eval_results_" + str(time.time()) + ".txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.early_stop > 0: # Early stop has been required by the user, check performance eval_results_files = glob.glob( os.path.join(eval_output_dir, prefix, 'eval_results_*.txt')) eval_results_files.sort( key=lambda x: os.stat(os.path.join(eval_output_dir, x)).st_mtime) if len(eval_results_files) > args.early_stop: perfect_predictions_before = read_perfect_predictions_from_file( eval_results_files[len(eval_results_files) - (args.early_stop + 1)]) if perfect_predictions <= perfect_predictions_before: return None return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="", data_split="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, data_split=data_split) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) nworkers = 16 eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=nworkers) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch_step, batch in enumerate(tqdm(eval_dataloader)): img, liwc, inputs, labels = batch inputs = inputs.to(args.device) labels = labels.to(args.device) img = img.unsqueeze(1).to(args.device) imgpos = None imgcls = None liwc = liwc.unsqueeze(1).to(args.device) with torch.no_grad(): lm_loss = 0. for cmt_i in range(1, args.num_cmts): curcondition = (img, imgpos, imgcls, liwc[:, :, cmt_i, :]) outputs = model( curcondition, inputs[:, :cmt_i * args.cmt_len], inputs[:, cmt_i * args.cmt_len:(cmt_i + 1) * args.cmt_len], labels=labels[:, cmt_i * args.cmt_len:(cmt_i + 1) * args.cmt_len]) lm_loss += outputs[0] if args.n_gpu > 1: lm_loss = lm_loss.mean() eval_loss += lm_loss.item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)).item() result = {"perplexity": perplexity, "eval_loss": eval_loss} return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 cls_loss = 0.0 kl_loss = 0.0 bow_loss = 0.0 nb_eval_steps = 0 model.eval() f = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): histories, responses, knowledges, kn_vocs, segments, chooses = batch histories = torch.LongTensor(histories).to(args.device) responses = torch.LongTensor(responses).to(args.device) knowledges = torch.LongTensor(knowledges).to(args.device) kn_vocs = torch.LongTensor(kn_vocs).to(args.device) segments = torch.LongTensor(segments).to(args.device) chooses = torch.FloatTensor(chooses).to(args.device) chooses = torch.cat([ -100 * torch.ones([histories.shape[0], histories.shape[-1] ]).float().to(args.device), chooses ], 1) lm_labels = torch.cat([ -100 * torch.ones([histories.shape[0], histories.shape[-1] ]).long().to(args.device), responses ], 1) lm_labels[lm_labels == 0] = -100 with torch.no_grad(): outputs, x_kn_att = model(input_ids=(histories, responses, knowledges, kn_vocs, chooses), lm_labels=lm_labels, token_type_ids=segments, use_posterior=False, use_bow=args.use_bow) loss = outputs[1] choose = torch.squeeze(x_kn_att) eval_loss += loss.mean().item() cls_loss += outputs[0].mean().item() #bow_loss += outputs[2].mean().item() #kl_loss += outputs[0].mean().item() nb_eval_steps += 1 if not f: logger.info(f"Choose: \n{choose}") f = 1 eval_loss = eval_loss / nb_eval_steps cls_loss = cls_loss / nb_eval_steps bow_loss = bow_loss / nb_eval_steps kl_loss = kl_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) result = { "perplexity": perplexity, "cls loss": cls_loss, "bow loss": bow_loss, "kl loss": kl_loss } output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
def train(args, data, datasets, model: PreTrainedModel, original_model, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) train_datasets = datasets['train'] dev_datasets = datasets['dev'] train_dataloaders, train_example_num, train_distribution = create_dataloader( args, train_datasets, tokenizer, train=True) dev_dataloaders, dev_example_num, dev_distribution = create_dataloader( args, dev_datasets, tokenizer, train=False) train_iter_num = sum( [len(dataloader) for dataloader in train_dataloaders.values()]) dev_iter_num = sum( [len(dataloader) for dataloader in dev_dataloaders.values()]) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( train_iter_num // args.gradient_accumulation_steps) + 1 else: t_total = train_iter_num // args.gradient_accumulation_steps * args.num_train_epochs model = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model.resize_token_embeddings(len(tokenizer)) original_model = original_model.module if hasattr( original_model, "module" ) else original_model # Take care of distributed/parallel training original_model.resize_token_embeddings(len(tokenizer)) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) original_model = torch.nn.DataParallel(original_model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) original_model = torch.nn.parallel.DistributedDataParallel( original_model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", train_example_num) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 best_loss = float('inf') best_step = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (train_iter_num // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( train_iter_num // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") model.zero_grad() original_model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) def inner_product(x, y): return torch.mean(torch.sum(y * x, 3)) def mean_square(x, y, idx): return torch.mean(torch.mean((y - x)**2, idx)) #return torch.mean(torch.sum((y - x) ** 2, 3)) def save_best_model(best_loss, best_step, dev_dataloaders): if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well eval_loss = evaluate(model, attributes_hiddens, dev_dataloaders) #eval_loss = evaluate(args, model, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens) logger.info(" global_step = %s, evaluate loss = %s", global_step, eval_loss) tb_writer.add_scalar("eval_loss", eval_loss, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) if eval_loss < best_loss: best_loss = eval_loss best_step = global_step checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-best") os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) #_rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) logger.info(" best_step = %s, best loss = %s", best_step, best_loss) return best_loss, best_step def get_hiddens_of_model(input): model.zero_grad() if args.model_type 
== 'roberta': _, _, hiddens = model.roberta(input) elif args.model_type == 'bert': _, _, hiddens = model.bert(input) elif args.model_type == 'albert': _, _, hiddens = model.albert(input) elif args.model_type == 'dbert': _, hiddens = model.distilbert(input) elif args.model_type == 'electra': _, hiddens = model.electra(input) elif args.model_type == 'gpt2': _, _, hiddens = model.transformer(input) elif args.model_type == 'gpt': _, hiddens = model.transformer(input) return hiddens def attribute_vector_example(): attributes_hiddens = {f'attribute{i}': [] for i in range(2)} dataloaders, _, distribution = create_dataloader(args, train_datasets, tokenizer, train=True) for key in distribution: if key != 'neutral': inputs, labels = next(dataloaders[key]) inputs = inputs.to(args.device) hiddens = get_hiddens_of_model(inputs) hiddens = torch.stack(hiddens, 2) if labels.size(1) > 1: onehot = torch.eye(hiddens.size(1)) zeros = torch.zeros(1, onehot.size(0)) onehot = torch.cat((zeros, onehot), 0) onehot = onehot[labels] onehot = torch.sum(onehot, 1) onehot = onehot.view(hiddens.size(0), -1, 1, 1) else: onehot = torch.eye(hiddens.size(1))[labels].view( hiddens.size(0), -1, 1, 1) onehot = onehot.to(args.device) attributes_hiddens[key].append( torch.sum(hiddens * onehot, 1) / labels.size(1)) # neutralも含まれている attribute_size = len(data['train']['example']) for i in range(attribute_size - 1): attributes_hiddens[f'attribute{i}'] = torch.mean( torch.cat(attributes_hiddens[f'attribute{i}'], 0), 0).detach().unsqueeze(0) return attributes_hiddens def forward(attributes_hiddens, dataloaders, key): inputs = next(dataloaders[key]) if len(inputs) == 2: inputs, labels = inputs labels = labels.to(args.device) else: labels = None inputs = inputs.to(args.device) if args.model_type == 'roberta': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.roberta( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.roberta( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) elif args.model_type == 'bert': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.bert( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.bert( inputs) if args.token_loss: token_predicts = model.cls(final_layer_hiddens) token_original = original_model.cls( final_layer_original_hiddens) elif args.model_type == 'albert': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.albert( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.albert( inputs) if args.token_loss: token_predicts = model.classifier(final_layer_hiddens) token_original = original_model.classifier( final_layer_original_hiddens) elif args.model_type == 'dbert': final_layer_hiddens, all_layer_hiddens = model.distilbert(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.distilbert( inputs) if args.token_loss: token_predicts = model.classifier(final_layer_hiddens) token_original = original_model.classifier( final_layer_original_hiddens) elif args.model_type == 'electra': final_layer_hiddens, all_layer_hiddens = model.electra(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.electra( 
inputs) if args.token_loss: hiddens = model.generator_predictions(final_layer_hiddens) token_predicts = model.generator_lm_head(hiddens) original_hiddens = original_model.generator_predictions( final_layer_original_hiddens) token_original = original_model.generator_lm_head( original_hiddens) elif args.model_type == 'gpt2': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.transformer( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.transformer( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) elif args.model_type == 'gpt': final_layer_hiddens, all_layer_hiddens = model.transformer(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.transformer( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) all_layer_hiddens = torch.stack(all_layer_hiddens, 2) if 'neutral' != key: all_original_hiddens = torch.stack(all_layer_original_hiddens, 2) all_original_hiddens = all_original_hiddens.detach() if args.token_loss: original_hiddens - original_hiddens.detach() token_original = token_original.detach() if args.debias_layer == 'all': target_layer_hiddens = all_layer_hiddens target_original_hiddens = all_layer_hiddens else: if args.debias_layer == 'first': idx = 0 elif args.debias_layer == 'last': idx = -1 target_layer_hiddens = all_layer_hiddens[:, :, idx] target_layer_hiddens = target_layer_hiddens.unsqueeze(2) if 'neutral' != key: target_original_hiddens = all_original_hiddens[:, :, idx] target_original_hiddens = target_original_hiddens.unsqueeze(2) else: attributes_hiddens = { key: value[:, idx, :].unsqueeze(1) for key, value in attributes_hiddens.items() } if args.loss_target == 'sentence' or labels is None: attributes_hiddens = { key: value.unsqueeze(1) for key, value in attributes_hiddens.items() } #elif args.loss_target == 'token' and key == 'neutral': elif args.loss_target == 'token': if labels.size(1) > 1: onehot = torch.eye(target_layer_hiddens.size(1)) zeros = torch.zeros(1, onehot.size(0)) onehot = torch.cat((zeros, onehot), 0) onehot = onehot[labels] onehot = torch.sum(onehot, 1) onehot = onehot.view(target_layer_hiddens.size(0), -1, 1, 1) else: onehot = torch.eye(target_layer_hiddens.size(1))[labels].view( target_layer_hiddens.size(0), -1, 1, 1) onehot = onehot.to(args.device) target_layer_hiddens = torch.sum(target_layer_hiddens * onehot, 1).unsqueeze(1) / labels.size(1) if 'neutral' != key: target_original_hiddens = torch.sum( target_original_hiddens * onehot, 1).unsqueeze(1) / labels.size(1) else: attributes_hiddens = { key: value.expand(target_layer_hiddens.size(0), 1, value.size(1), value.size(2)) for key, value in attributes_hiddens.items() } if 'neutral' == key: loss = 0 for attribute_hiddens in attributes_hiddens.values(): tmp_loss = criterion_ip(target_layer_hiddens, attribute_hiddens) if args.square_loss: tmp_loss = tmp_loss**2 tmp_loss *= alpha loss += tmp_loss else: #loss = criterion_ms(target_layer_hiddens, target_original_hiddens) loss = criterion_ms(all_layer_hiddens, all_original_hiddens, 3) if args.token_loss: loss += criterion_ms(token_predicts, token_original, 2) #loss += criterion_ms(hiddens, original_hiddens, 2) loss *= beta return loss #def evaluate(args, model: PreTrainedModel, original_model, 
             dev_dataloaders, dev_example_num, dev_distribution, criterion_mse,
             criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens,
             prefix="") -> Dict:

    def evaluate(model, attributes_hiddens, dev_dataloaders, prefix=""):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_output_dir = args.output_dir

        if args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly

        # multi-gpu evaluate
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", dev_example_num)
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        model.eval()
        # criterion.eval()

        for key in tqdm(dev_distribution):
            with torch.no_grad():
                loss = forward(attributes_hiddens, dev_dataloaders, key)
                eval_loss += loss.item()

        model.zero_grad()
        original_model.zero_grad()

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        '''
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            logger.info("  Loss = %s", eval_loss)
            writer.write("Loss = %s\n" % (eval_loss))
        '''
        return eval_loss

    # criterion_ms = torch.nn.MSELoss()
    criterion_ms = mean_square
    # criterion.train()
    criterion_ip = inner_product
    original_model.eval()

    alpha, beta = args.weighted_loss
    alpha = float(alpha)
    beta = float(beta)

    train_loss = 0.0
    for _ in train_iterator:
        random.shuffle(train_distribution)
        epoch_iterator = tqdm(train_distribution, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        model.eval()
        with torch.no_grad():
            attributes_hiddens = attribute_vector_example()

        for step, key in enumerate(epoch_iterator):
            model.train()

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            loss = forward(attributes_hiddens, train_dataloaders, key)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                original_model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logger.info(" global_step = %s, train loss = %s", global_step, train_loss)
                    train_loss = 0.0
                    # Log metrics
                    best_loss, best_step = save_best_model(best_loss, best_step, dev_dataloaders)
                    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
                        args, dev_datasets, tokenizer, train=False)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        train_dataloaders, train_example_num, train_distribution = create_dataloader(
            args, train_datasets, tokenizer, train=True)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
            args, dev_datasets, tokenizer, train=False)

    best_loss, best_step = save_best_model(best_loss, best_step, dev_dataloaders)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
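# The loop above plugs in `mean_square` and `inner_product` as criteria without
# defining them here. Below is a hedged sketch of what such criteria could look
# like, assuming both operate on batches of hidden states; the names, shapes,
# and exact formulas are hypothetical reconstructions, not the repo's definitions.
import torch


def mean_square(hiddens: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    # Mean squared distance between two batches of hidden states, shape (B, H).
    return ((hiddens - targets) ** 2).mean()


def inner_product(hiddens: torch.Tensor, attributes: torch.Tensor) -> torch.Tensor:
    # Squared inner products between each hidden state (B, H) and each
    # attribute vector (A, H); driving this toward zero decorrelates the two.
    return (hiddens @ attributes.t()).pow(2).mean()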
def export(
    tokenizer: PreTrainedTokenizer, model: PreTrainedModel, config: OnnxConfig, opset: int, output: Path
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        tokenizer: The tokenizer used to generate the dummy inputs
        model: The model to export
        config: The ONNX configuration associated with the exported model
        opset: The ONNX opset version to use for the export
        output: Path where the exported ONNX model will be stored

    Returns:
        A tuple with the model's matched input names and the ONNX output names
    """
    if not is_torch_available():
        raise ImportError("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export

    from ..file_utils import torch_version

    if not is_torch_onnx_dict_inputs_support_available():
        raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")

    logger.info(f"Using framework PyTorch: {torch.__version__}")
    with torch.no_grad():
        model.config.return_dict = True
        model.eval()

        # Check if we need to override certain configuration items
        if config.values_override is not None:
            logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
            for override_config_key, override_config_value in config.values_override.items():
                logger.info(f"\t- {override_config_key} -> {override_config_value}")
                setattr(model.config, override_config_key, override_config_value)

        # Ensure inputs match
        # TODO: Check when exporting QA we provide "is_pair=True"
        model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
        inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
        onnx_outputs = list(config.outputs.keys())

        if not inputs_match:
            raise ValueError("Model and config inputs don't match")

        config.patch_ops()

        # export can work with named args, but the dict containing the named args
        # has to be the last element of the args tuple
        export(
            model,
            (model_inputs,),
            f=output.as_posix(),
            input_names=list(config.inputs.keys()),
            output_names=onnx_outputs,
            dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
            do_constant_folding=True,
            use_external_data_format=config.use_external_data_format(model.num_parameters()),
            enable_onnx_checker=True,
            opset_version=opset,
        )

        config.restore_ops()

    return matched_inputs, onnx_outputs
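# A minimal usage sketch for the exporter above (not from the source). It
# assumes a transformers release in which BertOnnxConfig is importable from
# transformers.models.bert -- the import path has moved between releases, so
# treat this as illustrative only.
from pathlib import Path

from transformers import AutoModel, AutoTokenizer
from transformers.models.bert import BertOnnxConfig

sketch_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sketch_model = AutoModel.from_pretrained("bert-base-cased")
sketch_onnx_config = BertOnnxConfig(sketch_model.config)

# opset 12 is a common choice for transformer models
matched_inputs, onnx_outputs = export(
    sketch_tokenizer, sketch_model, sketch_onnx_config, opset=12, output=Path("model.onnx")
)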
def evaluate(args, eval_dataset: CoLDataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    torch.cuda.empty_cache()

    # # Loop to handle MNLI double evaluation (matched, mis-matched)
    # eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size

    # Note that DistributedSampler samples randomly
    def col_collate(examples):
        tokens, vokens = zip(*examples)
        if tokenizer._pad_token is None:
            tokens = pad_sequence(tokens, batch_first=True)
        else:
            tokens = pad_sequence(tokens, batch_first=True, padding_value=tokenizer.pad_token_id)
        vokens = pad_sequence(vokens, batch_first=True, padding_value=-100)
        return tokens, vokens

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=col_collate
    )

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    total_token_loss = 0.0
    total_voken_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for tokens, vokens in tqdm(eval_dataloader, desc="Evaluating"):
        token_inputs, token_labels, voken_labels = mask_tokens(tokens, vokens, tokenizer, args)
        token_inputs = token_inputs.to(args.device)
        token_labels = token_labels.to(args.device) if args.mlm_ratio != 0 else None
        voken_labels = voken_labels.to(args.device)
        # If some of the input is padded, then the attention mask is needed
        attention_mask = (token_inputs != tokenizer.pad_token_id)  # word_tokens --> 1, pad_token --> 0
        if attention_mask.all():
            attention_mask = None

        with torch.no_grad():
            outputs = model(token_inputs,
                            attention_mask=attention_mask,
                            masked_lm_labels=token_labels,
                            voken_labels=voken_labels)
            voken_loss = outputs[0]
            token_loss = outputs[1]
            total_voken_loss += voken_loss.item()
            total_token_loss += token_loss.item()

        nb_eval_steps += 1

    total_token_loss = total_token_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(total_token_loss)).item()

    result = {"perplexity": perplexity,
              "voken_loss": total_voken_loss / nb_eval_steps}

    torch.cuda.empty_cache()
    return result
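# The `mask_tokens` helper used above (and by several variants below) is
# defined elsewhere in each repo. As a reference, here is a sketch of the
# standard BERT-style dynamic masking it typically implements (15% of tokens
# selected, split 80% [MASK] / 10% random / 10% unchanged). Note the voken
# variant above additionally threads voken labels through, which this omits.
import torch


def mask_tokens_sketch(inputs: torch.Tensor, tokenizer, mlm_probability: float = 0.15):
    labels = inputs.clone()
    # Sample positions to mask, avoiding special tokens.
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = torch.tensor(
        [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
        dtype=torch.bool,
    )
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked tokens

    # 80% of the time, replace the masked input token with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.mask_token_id

    # 10% of the time, replace with a random token; the remaining 10% stay unchanged
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    inputs[indices_random] = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)[indices_random]
    return inputs, labels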
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    acc = []
    pos_loss_list = []
    neg_loss_list = []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # for batch in batchs:
        batch['input_ids'] = batch['input_ids'].to(args.device)
        batch['masked_lm_labels'] = batch['masked_lm_labels'].to(args.device)
        batch['attention_mask'] = batch['attention_mask'].to(args.device)

        with torch.no_grad():
            outputs = model(**batch)
            lm_loss = outputs[0]
            pos_loss = lm_loss[::2]
            neg_loss = lm_loss[1::2]
            # pos_loss = model(**batchs[0])[0]
            # neg_loss = model(**batchs[1])[0]
            acc.extend((pos_loss < neg_loss).long().tolist())
            pos_loss_list.extend(pos_loss.tolist())
            neg_loss_list.extend(neg_loss.tolist())

        nb_eval_steps += 1

    # eval_loss = eval_loss / nb_eval_steps
    # perplexity = torch.exp(torch.tensor(eval_loss))
    result = {
        "acc": np.mean(acc),
        "pos_loss": np.mean(pos_loss_list),
        "neg_loss": np.mean(neg_loss_list)
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
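# Toy illustration of the interleaved pos/neg convention used above: even rows
# hold positive examples, odd rows their negatives, and a pair counts as
# correct when the positive's loss is strictly lower. Values are made up.
import torch

lm_loss = torch.tensor([0.9, 1.4, 2.0, 1.1])  # [pos0, neg0, pos1, neg1]
pos_loss, neg_loss = lm_loss[::2], lm_loss[1::2]
acc = (pos_loss < neg_loss).float().mean()
print(acc)  # 0.5 -- the second pair is ranked incorrectly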
def evaluate(args, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, run_batch_fn, desc="") -> Dict:
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    # eval_batch_size for selection must be 1 to handle variable number of candidates
    if args.task == "selection":
        args.eval_batch_size = 1
    else:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=eval_dataset.collate_fn)

    # multi-gpu evaluate
    if args.n_gpu > 1 and (args.task != "selection" or eval_dataset.args.eval_all_snippets):
        if not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []

    for batch in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            loss, lm_logits, mc_logits, mc_labels = run_batch_fn(args, model, batch)
            if args.task == "detection":
                mc_logits = mc_logits.sigmoid()
            if args.task in ["selection", "detection"]:
                data_infos.append(batch[-1])
            all_preds.append(mc_logits.detach().cpu().numpy())
            all_labels.append(mc_labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if args.task.lower() in ["generation", "reconstruction"]:
        perplexity = torch.exp(torch.tensor(eval_loss))
        result = {"perplexity": perplexity, "loss": eval_loss}
    elif args.task.lower() == "selection":
        all_labels = np.array(all_labels).reshape(-1)
        all_pred_ids = np.array([np.argmax(logits) for logits in all_preds])
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        logger.info("Avg. # of candidates: %f", sum([len(arr[0]) for arr in all_preds]) / len(all_preds))
        result = {"loss": eval_loss, "accuracy": accuracy}
        if args.output_file:
            sorted_pred_ids = [np.argsort(logits.squeeze())[::-1] for logits in all_preds]
            write_selection_preds(eval_dataset.dataset_walker, args.output_file, data_infos, sorted_pred_ids, topk=5)
    elif args.task.lower() == "detection":
        all_labels = np.concatenate(all_labels)
        all_pred_ids = (np.concatenate(all_preds) > 0.5)
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        precision = sklearn.metrics.precision_score(all_labels, all_pred_ids)
        recall = sklearn.metrics.recall_score(all_labels, all_pred_ids)
        result = {
            "loss": eval_loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }
        if args.output_file:
            write_detection_preds(eval_dataset.dataset_walker, args.output_file, data_infos, all_pred_ids)
    else:
        raise ValueError("args.task not in ['generation', 'selection', 'detection'], got %s" % args.task)

    if args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
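# What the selection branch above does per example, in isolation: rank the
# candidate logits in descending order and keep the top-k indices for the
# prediction file. Values here are made up.
import numpy as np

candidate_logits = np.array([[0.1, 2.3, -0.4, 1.7]])
sorted_pred_ids = np.argsort(candidate_logits.squeeze())[::-1]
print(sorted_pred_ids[:2])  # [1 3] -- the two best-scoring candidates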
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, doubling=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        labels = inputs.clone()
        labels[:, -1] = -100
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, lm_labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    result = {"perplexity": perplexity}

    ###### Evaluate NSP accuracy
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    eval_dataset_second = load_and_cache_examples(args, tokenizer, evaluate=True, second=True)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    eval_correct_sampler = SequentialSampler(eval_dataset_second)
    eval_correct_dataloader = DataLoader(eval_dataset_second, sampler=eval_correct_sampler,
                                         batch_size=args.eval_batch_size, collate_fn=collate)

    eval_wrong_sampler = RandomSampler(eval_dataset_second)
    eval_wrong_dataloader = DataLoader(eval_dataset_second, sampler=eval_wrong_sampler,
                                       batch_size=args.eval_batch_size, collate_fn=collate)

    nb_eval_steps = 0
    num_correctly_predicted = 0
    num_wrongly_predicted = 0

    for zipped_batch in tqdm(zip(eval_dataloader, eval_correct_dataloader, eval_wrong_dataloader),
                             desc="Evaluating", total=len(eval_dataloader)):
        batch, correct_batch, wrong_batch = zipped_batch
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)

        if random.randint(0, 1) == 1:
            second_input = correct_batch
            if_correct = True
        else:
            second_input = wrong_batch
            if_correct = False

        first_merged_inputs = torch.cat((inputs, second_input), 1)
        first_merged_inputs = first_merged_inputs.to(args.device)
        inputs = inputs.to(args.device)  # the MLM branch below also needs device tensors
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(first_merged_inputs)
            mc_logits = outputs[2].cpu()
            # print(mc_logits.shape)
            # print(if_correct, mc_logits)
            for jj in range(mc_logits.shape[0]):
                if (mc_logits[jj, 1] > mc_logits[jj, 0]) == if_correct:
                    num_correctly_predicted += 1
                else:
                    num_wrongly_predicted += 1
        nb_eval_steps += 1

    total_predicted = num_correctly_predicted + num_wrongly_predicted
    accuracy = num_correctly_predicted / total_predicted
    result["accuracy"] = accuracy

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
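# The NSP probe above in miniature: pair each first segment with either its
# true continuation or a shuffled one via a coin flip, then score the model's
# binary head against that flip. Token ids below are arbitrary stand-ins.
import random
import torch

first = torch.tensor([[101, 7592, 102]])        # e.g. [CLS] ... [SEP]
correct_second = torch.tensor([[2088, 102]])    # the true continuation
wrong_second = torch.tensor([[1999, 102]])      # a randomly drawn continuation

if_correct = random.randint(0, 1) == 1
second_input = correct_second if if_correct else wrong_second
merged = torch.cat((first, second_input), 1)
print(merged.shape, if_correct)  # the binary head's logits are then compared against if_correct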
def save_preds(args, data_generator, tb_writer, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, global_step, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    criterion = nn.BCEWithLogitsLoss()

    eval_dataset = data_generator.instance_a_lb_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        tweet_ids = [b[3] for b in batch]
        user_ids = [b[4] for b in batch]
        inputs = [b[2] for b in batch]
        lens = [len(x) for x in inputs]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        tokens, features = [torch.tensor(x) for x in [tokens, features]]
        return tokens, features, tweet_ids, user_ids, inputs, attention_mask, torch.tensor(lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset, bucket_boundaries,
                                               batch_size=args.eval_batch_size, drop_last=False)
        eval_dataloader = DataLoader(eval_dataset, batch_size=1,
                                     batch_sampler=eval_sampler, collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    nb_eval_steps = 0
    model.eval()
    tweets, users, preds = [], [], []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # training loop
        tokens, features, tweet_ids, user_ids, inputs, attention_mask, lens = batch
        tokens, features, inputs, attention_mask, lens = [
            x.to(args.device) for x in [tokens, features, inputs, attention_mask, lens]
        ]
        tokens, features = [x.float() for x in [tokens, features]]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            tweets += tweet_ids
            users += user_ids
            preds.append(pred)

        nb_eval_steps += 1
        # if nb_eval_steps == 10:
        #     break

    tweets = np.array(tweets)
    users = np.array(users)
    preds = np.float64(np.vstack(preds))
    print(tweets.shape, users.shape, preds.shape)
    print(tweets[0:10])
    print(users[0:10])

    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        preds_i = preds[:, i]
        print(preds_i.shape)
        with open(args.test_inference_path + "submission_{}.csv".format(engage), "w") as f:
            for k in range(preds_i.shape[0]):
                f.write(str(tweets[k]) + "," + str(users[k]) + "," + str(preds_i[k]) + "\n")
        print("Saved to csv the predictions for task {}".format(engage))
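# BySequenceLengthSampler above is repo-specific; this hedged sketch only
# shows the underlying idea -- assign each example to a length bucket so that
# padded batches waste less compute. Boundaries mirror the ones used above;
# the lengths are made up.
import numpy as np

bucket_boundaries = [0, 20, 40, 60, 80, 101]
example_lengths = [5, 18, 37, 64, 99, 12]
bucket_ids = np.digitize(example_lengths, bucket_boundaries)
print(bucket_ids)  # examples sharing a bucket id can be batched together with little padding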
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    # MODIF FOR EVAL SCRIPT / USE CUSTOM DATASET
    # eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    vectorizer = VectorizeParagraph(tokenizer=tokenizer,
                                    block_size=GPT2_BLOCK_SIZE,
                                    mode=VectorizeMode.TRAIN,
                                    use_context=True,
                                    select_summary=lambda input_dict: random.choice(list(input_dict.values())))
    eval_dataset = DatasetFromRepo(path=args.eval_data_file, transform=vectorizer)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[Tuple[torch.Tensor]]):
        all_inputs = [elt[0] for elt in examples]
        all_types = [elt[1] for elt in examples]
        all_labels = [elt[2] for elt in examples]
        padded_inputs = pad_sequence(all_inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        padded_types = pad_sequence(all_types, batch_first=True, padding_value=tokenizer.pad_token_id)
        padded_labels = pad_sequence(all_labels, batch_first=True, padding_value=-100)
        return padded_inputs, padded_types, padded_labels

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, types, labels = batch
        inputs = inputs.to(args.device)
        types = types.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels, token_type_ids=types)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
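# The collate above in isolation: inputs and token types are padded with the
# tokenizer's pad id, while labels are padded with -100 so the LM loss ignores
# the padding positions. Toy tensors; pad id 0 stands in for tokenizer.pad_token_id.
import torch
from torch.nn.utils.rnn import pad_sequence

pad_id = 0  # stand-in for tokenizer.pad_token_id
toy_inputs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
toy_labels = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
print(pad_sequence(toy_inputs, batch_first=True, padding_value=pad_id))   # [[5, 6, 7], [8, 9, 0]]
print(pad_sequence(toy_labels, batch_first=True, padding_value=-100))     # [[5, 6, 7], [8, 9, -100]]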
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)

    def collate(examples: List[Dict]):
        inputs, inputs_type, labels = [], [], []
        for sample in examples:
            inputs.append(sample['inputs'])
            inputs_type.append(sample['inputs_type'])
            labels.append(sample['label'])
        labels = torch.LongTensor(labels)
        if tokenizer._pad_token is None:
            return {
                'inputs': pad_sequence(inputs, batch_first=True),
                'inputs_type': pad_sequence(inputs_type, batch_first=True),
                'label': labels
            }
        return {
            'inputs': pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id),
            'inputs_type': pad_sequence(inputs_type, batch_first=True, padding_value=tokenizer.pad_token_id),
            'label': labels
        }

    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    n_correct, n_recall, n_precision = [0, 0], [0, 0], [0, 0]
    score_list, label_list = [], []
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, inputs_type, labels = batch['inputs'], batch['inputs_type'], batch['label']
        inputs = inputs.to(args.device)
        inputs_type = inputs_type.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, token_type_ids=inputs_type, labels=labels)
            lm_loss, logits = outputs[0], outputs[1]
            eval_loss += lm_loss.mean().item()
            prediction = torch.max(logits.view(-1, 2), dim=-1)[1]  # TODO: magic number
            n_correct_vec = prediction.eq(labels).float()
            n_correct[0] += n_correct_vec.sum()
            n_correct[1] += prediction.size(0)
            n_recall[0] += torch.sum(n_correct_vec * labels.eq(1).float())
            n_recall[1] += labels.eq(1).float().sum()
            n_precision[0] += torch.sum(n_correct_vec * prediction.eq(1).float())
            n_precision[1] += prediction.eq(1).float().sum()
            score_list += torch.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
            label_list += labels.detach().cpu().tolist()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    score_list, label_list = np.array(score_list), np.array(label_list)
    fpr, tpr, thresholds = metrics.roc_curve(label_list, score_list, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    result = {
        "loss": eval_loss,
        'AUC': auc,
        'accuracy': n_correct[0] / n_correct[1],
        'recall': n_recall[0] / n_recall[1],
        'precision': n_precision[0] / n_precision[1]
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
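# Computing AUC exactly as the function above does, on toy scores: roc_curve
# sweeps the decision threshold over the positive-class probabilities, and
# auc integrates the resulting ROC curve.
import numpy as np
from sklearn import metrics

label_list = np.array([0, 0, 1, 1])
score_list = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = metrics.roc_curve(label_list, score_list, pos_label=1)
print(metrics.auc(fpr, tpr))  # 0.75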
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="", debug=False) -> Tuple[float, float]:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch_idx, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1
        if debug and batch_idx == 10:
            break

    eval_loss = eval_loss / nb_eval_steps
    perplexity = np.exp(eval_loss)
    return eval_loss, perplexity
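# Perplexity, as computed above, is just exp of the mean token-level
# cross-entropy; np.exp here matches the torch.exp(torch.tensor(loss))
# used by the other variants. Worked example with a made-up loss:
import numpy as np

mean_eval_loss = 2.3026  # ~ln(10)
print(np.exp(mean_eval_loss))  # ~10.0 -- the model is as uncertain as a uniform 10-way choice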
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, global_step=None, tr_loss=None, prefix="") -> Dict:
    # added global_step, tr_loss -TJ
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size  # * max(1, args.n_gpu)  # commented -TJ

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):  # added second clause -TJ
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "a") as writer:  # changed mode from w to a -TJ
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            # writer.write("%s = %s\n" % (key, str(result[key])))
        # modifying what to log -TJ
        dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        logstr = '{} Step {}: train loss = {:.3f}, valid loss = {:.3f}, valid perpl = {:.1f}\n'.format(
            dt_string, global_step, tr_loss, eval_loss, perplexity)
        writer.write(logstr)

    return result
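# The appended log line above, rendered in isolation with placeholder values,
# to show what lands in eval_results.txt each time this variant runs.
from datetime import datetime

dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
sample_logstr = '{} Step {}: train loss = {:.3f}, valid loss = {:.3f}, valid perpl = {:.1f}\n'.format(
    dt_string, 1000, 2.456, 2.301, 9.98)
print(sample_logstr, end="")  # e.g. "01/01/2024 12:00:00 Step 1000: train loss = 2.456, ..."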
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating", unit_scale=args.eval_batch_size, unit="examples"):
        if args.eval_subsampling != 1.0 and random.random() >= args.eval_subsampling:
            continue
        if args.wiki_dataset:
            if args.mlm:
                raise RuntimeError("Can't do mlm for wiki dataset")
            tokens, loss_mask = batch
            inputs, labels = (tokens, tokens)
            loss_mask = loss_mask.to(args.device)
            loss_weights = (~loss_mask) + loss_mask * args.title_scale
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            with torch.no_grad():  # evaluation must not accumulate gradients
                outputs = model(inputs, labels=labels, loss_weights=loss_weights)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    loss = torch.tensor(eval_loss)

    result = {"perplexity": perplexity, "loss": loss}

    if args.eval_creativity_blacklist:
        if not args.parsed_dictionary_dataset:
            raise RuntimeError("Evaluating creativity blacklist with a non-parsed dictionary dataset")
        blacklist = datasets.Blacklist.load(args.eval_creativity_blacklist)
        print(
            f"Evaluating creativity over {args.num_eval_creativity} words with {args.eval_creativity_batch_size} batch size"
        )
        s = time.time()
        result.update(
            datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(
                tokenizer,
                model,
                blacklist,
                args.num_eval_creativity,
                args.eval_creativity_batch_size,
                max_length=args.block_size,
            ))
        print(f"Done evaluating creativity in {time.time() - s}s")

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
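# Hedged sketch of the loss_weights trick in the wiki branch above: title
# tokens get up-weighted by title_scale while ordinary tokens keep weight 1.
# The mask and scale below are illustrative stand-ins for the real batch
# values and args.title_scale.
import torch

title_scale = 2.0
loss_mask = torch.tensor([False, True, False])  # True where the token belongs to a title
loss_weights = (~loss_mask) + loss_mask * title_scale
print(loss_weights)  # tensor([1., 2., 1.]) -- per-token multipliers for the LM loss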