Example #1
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
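Several of these examples (this one included) call a mask_tokens(batch, tokenizer, args) helper that is not shown. A minimal sketch, modeled on the helper in Hugging Face's run_language_modeling.py example and assuming the tokenizer has a mask token and args defines mlm_probability; the surrounding repository may define it differently:

def mask_tokens(inputs, tokenizer, args):
    """Prepare masked inputs/labels for MLM: 80% [MASK], 10% random, 10% unchanged."""
    labels = inputs.clone()
    # Sample tokens to mask (typically args.mlm_probability = 0.15), skipping special tokens.
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # loss is only computed on masked tokens

    # 80% of the masked tokens become [MASK].
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% become a random token; the remaining 10% are left unchanged.
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels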
Example #2
def evaluate(args,
             corrects,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    def get_mask_idx(batch):
        mask_token = tokenizer.mask_token_id
        return [
            list(batch[i]).index(mask_token) for i in range(batch.shape[0])
        ]

    def compute_ranked_accuracy(query2answers):

        accurate = 0
        total = 0
        answers, batches = query2answers
        for batch in tqdm(batches, desc="Evaluating"):
            batch = torch.tensor(batch).to(torch.int64)
            batch = batch.to(args.device)
            prediction_scores = model(batch)[0]
            masked_indices = get_mask_idx(batch)
            prediction_scores = prediction_scores[
                np.arange(prediction_scores.shape[0]), masked_indices, :]

            for i, (prediction,
                    sample) in enumerate(zip(prediction_scores, batch)):
                key = " ".join(
                    tokenizer.convert_ids_to_tokens(
                        sample[1:masked_indices[i]]))
                correct_objects = answers[key]
                numb_correct_answers = len(correct_objects)
                predicted_ids = torch.argsort(
                    prediction, dim=0, descending=True)[:numb_correct_answers]
                ranked_predictions = tokenizer.convert_ids_to_tokens(
                    predicted_ids)

                accurate += len(
                    set(ranked_predictions)
                    & set(correct_objects)) / numb_correct_answers
                total += 1.0

        return accurate / total

    model.eval()
    result = {}
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Batch size = %d", args.batch_size)
    for eval_type, query2answers in corrects.items():
        with torch.no_grad():
            accuracy = compute_ranked_accuracy(query2answers)
            accuracy = round(accuracy, 4)
            result[eval_type + '_ranked_acc'] = accuracy

    logger.info("***** Eval results {} *****".format(prefix))
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))

    return result
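The corrects argument is consumed but never constructed here. A hypothetical toy construction, inferred purely from how the code above indexes it: each value is a pair (answers, batches), where batches holds batches of equal-length token-id sequences containing exactly one mask token, and answers maps the space-joined tokens between position 1 and the mask to the gold object tokens. The eval-type name and query below are illustrative, and a BERT-style tokenizer is assumed:

ids = tokenizer.encode(f"The capital of France is {tokenizer.mask_token}.")
mask_pos = ids.index(tokenizer.mask_token_id)
key = " ".join(tokenizer.convert_ids_to_tokens(ids[1:mask_pos]))
answers = {key: ["paris"]}          # gold object token(s) for this query
batches = [[ids]]                   # a single batch containing a single query
corrects = {"lama_trex": (answers, batches)}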
Example #3
def evaluate_model(model: PreTrainedModel, loader: DataLoader,
                   device: torch.device) -> float:
    model.eval()
    total_loss = 0
    with torch.no_grad():  # no gradient tracking is needed during evaluation
        for batch in loader:
            for k, v in batch.items():
                batch[k] = v.to(device)

            outputs = model(**batch)
            loss = outputs[0]

            total_loss += loss.item()

    return total_loss / len(loader.dataset)
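A minimal usage sketch, assuming a tokenizer and model are already loaded and that the model computes its loss from a labels field (the texts and the labels choice are illustrative). It relies on the fact that model(**batch) expects dict batches and that PyTorch's default collate stacks dict values:

texts = ["hello world", "transformers are neat"]
enc = tokenizer(texts, padding=True, return_tensors="pt")
enc["labels"] = enc["input_ids"].clone()    # assumption: an LM-style model that accepts `labels`
dataset = [{k: v[i] for k, v in enc.items()} for i in range(len(texts))]
loader = DataLoader(dataset, batch_size=2)  # default collate stacks each dict field
avg_loss = evaluate_model(model, loader, torch.device("cpu"))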
Example #4
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix='') -> Dict:

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 args.eval_batch_size,
                                 sampler=eval_sampler,
                                 collate_fn=collate,
                                 num_workers=args.num_workers)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(f'  Num examples = {len(eval_dataset)}')
    logger.info(f'  Batch size = {args.eval_batch_size}')
    eval_loss = 0.0
    model.eval()

    for step, batch in enumerate(tqdm(eval_dataloader, desc='Evaluating')):
        inputs, labels = mask_tokens(batch, tokenizer, args) \
            if args.mlm else (batch, batch)
        inputs, labels = inputs.to(args.device), labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) \
                if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]
            eval_loss += loss.mean().item()

    eval_loss = eval_loss / (step + 1)
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}
    return result
Example #5
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        # If some of the input is padded, then the attention mask is needed
        attention_mask = (inputs != tokenizer.pad_token_id)  # word_tokens --> 1, pad_token --> 0
        if attention_mask.all():
            attention_mask = None

        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss)).item()

    result = {"perplexity": perplexity}

    return result
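The pad-mask trick above is easy to sanity-check on a toy batch (assuming the pad id is 0, as it is for BERT-style tokenizers); when no padding is present the mask is all ones and is dropped entirely:

inputs = torch.tensor([[5, 6, 7, 0, 0],
                       [8, 9, 10, 11, 12]])
attention_mask = (inputs != 0).int()   # 1 for real tokens, 0 for padding
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1]])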
Example #6
def tsne(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    tsne = TSNE()
    eval_output_dir = args.output_dir

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples):
        if tokenizer._pad_token is None:
            return (pad_sequence([d[0] for d in examples], batch_first=True), torch.tensor([d[1] for d in examples]))
        return (pad_sequence([d[0] for d in examples], batch_first=True, padding_value=tokenizer.pad_token_id), torch.tensor([d[1] for d in examples]))

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate)
    # train_dataloader = DataLoader(
    #     train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    model.eval()
    X = None
    y = None
    for batch in tqdm(train_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch[0], tokenizer, args) if args.mlm else (batch[0], batch[0])
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        domain_labels = batch[1].to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels, domain_labels=domain_labels) if args.mlm else model(inputs, labels=labels, domain_labels=domain_labels)
            # outputs[1] is used here as a per-example feature tensor; accumulate it on the CPU.
            # (.item() only works on single-element tensors and cannot be passed to torch.cat)
            features = outputs[1].detach().cpu()
            domains = domain_labels.detach().cpu()
            if X is not None:
                X = torch.cat((X, features), 0)
                y = torch.cat((y, domains), 0)
            else:
                X = features
                y = domains

    X_embedded = tsne.fit_transform(X.numpy())
    palette = sns.color_palette("bright", len(torch.unique(y)))
    sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y.numpy(), legend='full', palette=palette)
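This snippet relies on plotting and embedding imports that are not shown, and it never writes the figure anywhere. A sketch of the assumed surroundings; the output filename is hypothetical and args is the same namespace passed to tsne():

import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# ...after tsne(args, train_dataset, model, tokenizer) has drawn the scatterplot:
plt.savefig(os.path.join(args.output_dir, "tsne_domains.png"), dpi=150)
plt.close()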
Example #7
    def __test(self, model: PreTrainedModel,
               data: DataLoader) -> Tuple[float, float, float, float, float, str]:
        eval_loss = 0.
        eval_steps, eval_examples = 0, 0
        tokens, eval_predictions, eval_labels = [], [], []
        model.eval()
        for batch in tqdm(data):
            batch_tokens, batch_masks, batch_tags = tuple(
                t.to(self.device) for t in batch)
            with torch.no_grad():
                outputs = model(batch_tokens,
                                attention_mask=batch_masks,
                                labels=batch_tags)
            logits = outputs[1].detach().cpu().numpy()
            label_ids = batch_tags.to('cpu').numpy()
            toks = batch_tokens.to('cpu').numpy()

            eval_loss += outputs[0].mean().item()
            batch_toks = [
                self.tokenizer.convert_ids_to_tokens(sentence)
                for sentence in toks
            ]
            tokens.extend(batch_toks)
            eval_predictions.extend(
                [list(p) for p in np.argmax(logits, axis=2)])
            eval_labels.extend(label_ids)

            eval_examples += batch_tokens.size(0)
            eval_steps += 1

        eval_loss = eval_loss / eval_steps

        predicted_tags, valid_tags, tokens = self.translate(
            eval_predictions, eval_labels, tokens)

        score_acc = accuracy_score(valid_tags, predicted_tags)
        score_f1 = f1_score(valid_tags, predicted_tags)
        score_p = precision_score(valid_tags, predicted_tags)
        score_r = recall_score(valid_tags, predicted_tags)
        report = classification_report(valid_tags, predicted_tags)

        return eval_loss, score_acc, score_f1, score_p, score_r, report
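accuracy_score, f1_score, precision_score, recall_score, and classification_report operating on tag sequences strongly suggest the seqeval package (an assumption; scikit-learn exposes the same names but for flat label arrays):

from seqeval.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score)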
Example #8
    def predict_task_split(self,
                           model: transformers.PreTrainedModel,
                           inputs: tf.data.Dataset,
                           task: Task,
                           max_length: int = 140,
                           min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:

        try:
            outputs = []
            model.to(self.device)
            for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                          desc="Predicting %s" % task,
                                          unit="batch", leave=False):
                with torch.no_grad():
                    model.eval()
                    forward_params = self.prepare_forward_inputs(model, batch_inputs)
                    batch_outputs = model.generate(forward_params['input_ids'],
                                                   attention_mask=forward_params['attention_mask'],
                                                   do_sample=False,
                                                   max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                                                   min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                                                   num_beams=4,
                                                   length_penalty=2.,
                                                   no_repeat_ngram_size=3,
                                                   early_stopping=True)

                    batch_outputs = batch_outputs.detach().cpu().numpy()
                    outputs.extend(batch_outputs)
            return outputs
        # We can't just except tf.errors.UnknownError, because it is thrown as some sort of weird proxy
        # instance of a tf.errors.UnknownError and python's pattern matching can't handle the scandal
        except Exception as e:
            if isinstance(e, tf.errors.UnknownError):
                logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
                # Unfortunately, we don't get a more helpful error type, but this usually means
                # that the dataset has no labels for a given split (e.g., test evaluation occurs on a server)
                return []
            else:
                # We got a different exception type so let python freak out accordingly
                logging.error('Encountered error: %s on %s: %s', type(e), task, e)
                raise e
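GENERATION_MAX_LENGTHS and GENERATION_MIN_LENGTHS are module-level lookups keyed by dataset name that are not shown. A hypothetical sketch of their shape; the dataset keys and numbers are illustrative, not the repository's values:

GENERATION_MAX_LENGTHS = {"cnn_dailymail": 140, "xsum": 60}
GENERATION_MIN_LENGTHS = {"cnn_dailymail": 55, "xsum": 10}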
Example #9
def convert_to_onnx(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, output_path, opset: int = 12):
    onnx_output_path = os.path.join(output_path,
                                    'checkpoint_without_optimize.onnx')
    onnx_optimized_output_path = os.path.join(output_path,
                                              'checkpoint_with_optimize.onnx')
    onnx_optimized_fp16_output_path = os.path.join(
        output_path, 'checkpoint_with_optimize_fp16.onnx')

    model.eval()
    with torch.no_grad():
        input_names, output_names, dynamic_axes, tokens = infer_shapes(
            model, tokenizer)
        ordered_input_names, model_args = ensure_valid_input(
            model, tokens, input_names)
        print(f"Model input names: {ordered_input_names}.")
        export(model,
               model_args,
               onnx_output_path,
               input_names=ordered_input_names,
               output_names=output_names,
               dynamic_axes=dynamic_axes,
               verbose=True,
               opset_version=opset)
        print(
            f"Finished output checkpoint_without_optimize.onnx to {output_path}."
        )

    optimized_model = optimizer.optimize_model(onnx_output_path,
                                               model_type='bert',
                                               num_heads=12,
                                               hidden_size=768,
                                               use_gpu=True)
    optimized_model.save_model_to_file(onnx_optimized_output_path)
    print(f"Finished output checkpoint_with_optimize.onnx to {output_path}.")
    optimized_model.convert_model_float32_to_float16()
    optimized_model.save_model_to_file(onnx_optimized_fp16_output_path)
    print(
        f"Finished output checkpoint_with_optimize_fp16.onnx to {output_path}."
    )
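Names used above but not defined in the excerpt: export is presumably torch.onnx.export, and optimizer is presumably onnxruntime's transformer optimizer module. infer_shapes and ensure_valid_input resemble the helpers in transformers.convert_graph_to_onnx, although that module's infer_shapes takes a pipeline and a framework string rather than a model and tokenizer, so they may be repository-local wrappers. A sketch of the imports this implies:

from torch.onnx import export                                       # export(model, args, path, ...)
from onnxruntime.transformers import optimizer                      # optimizer.optimize_model(...)
from transformers.convert_graph_to_onnx import ensure_valid_input   # (model, tokens, input_names)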
Example #10
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)

    evaluation_loss = dict()

    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    # Optionally, the eval batch size could be tied to the number of train epochs so the
    # train and eval loss curves have the same length and the eval-loss plot stays
    # readable within the plot frame:
    # args.eval_batch_size = int(len(eval_dataset) / args.num_train_epochs)

    # Standard setting used instead:
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) \
            if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) \
                if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    # write for each batch
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    print('\n--------------------------')

    result = {
        "perplexity": perplexity,
        "eval_loss": eval_loss,
        "eval_steps": nb_eval_steps
    }

    output_eval_file = os.path.join(FINETUNE_DIR, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    print('----------------------------')

    return result
Example #11
def extract_feature(args,
                    model: PreTrainedModel,
                    tokenizer: PreTrainedTokenizer,
                    prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Extracting features {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    model.eval()

    sample_size = 0
    summations = []
    ## first time calculate summations
    for batch in tqdm(eval_dataloader, desc="Calculate Mean"):
        with torch.no_grad():
            batch = batch.to(args.device)
            outputs = model(batch)
            logits, hidden_states = outputs
            tmp_hidden_states = []
            if args.target.lower() == "cls":
                for i, state in enumerate(hidden_states):
                    tmp_hidden_states.append(state[:, 0, :])
            elif args.target == "words":
                for i, state in enumerate(hidden_states):
                    state = state[:, 1:, :]
                    mask = (batch[:, 1:] != 0).unsqueeze(2)
                    state = mask * state
                    state = torch.sum(state, dim=1) / torch.sum(mask, dim=1)
                    tmp_hidden_states.append(state)
            elif args.target == "all":
                for i, state in enumerate(hidden_states):
                    mask = (batch != 0).unsqueeze(2)
                    state = mask * state
                    state = torch.sum(state, dim=1) / torch.sum(mask, dim=1)
                    tmp_hidden_states.append(state)
            else:
                raise NotImplementedError()
            hidden_states = tmp_hidden_states
            assert hidden_states[0].dim() == 2
            if len(summations) == 0:
                for state in hidden_states:
                    summations.append(torch.sum(state, dim=0))
                assert len(summations) == len(hidden_states)
            else:
                for i, state in enumerate(hidden_states):
                    summations[i] += torch.sum(state, dim=0)
            sample_size += len(hidden_states[0])
            if sample_size >= args.max_sample_size:
                break

    # assert sample_size==len(eval_dataset)
    mean = [s / sample_size for s in summations]

    ## second time calculate variance
    #  summations = []
    #  sample_size = 0
    #  for batch in tqdm(eval_dataloader, desc="Calculate Variance"):
    #      with torch.no_grad():
    #          batch = batch.to(args.device)
    #          outputs = model(batch)
    #          logits, hidden_states = outputs
    #          tmp_hidden_states = []
    #          if args.target.lower()=="cls":
    #              for i, state in enumerate(hidden_states):
    #                  tmp_hidden_states.append(state[:,0,:])
    #          elif args.target=="words":
    #              for i, state in enumerate(hidden_states):
    #                  state = state[:,1:,:]
    #                  mask = (batch[:,1:]!=0).unsqueeze(2)
    #                  state = mask*state
    #                  tmp_hidden_states.append(state)
    #          elif args.target=="all":
    #              for i, state in enumerate(hidden_states):
    #                  mask = (batch!=0).unsqueeze(2)
    #                  state = mask*state
    #                  tmp_hidden_states.append(state)
    #          else:
    #              raise NotImplementedError()
    #          hidden_states = tmp_hidden_states
    #          if args.target.lower()=="cls":
    #              if len(summations)==0:
    #                  for state, m in zip(hidden_states, mean):
    #                      summations.append(torch.sum((state-m.unsqueeze(0))**2, dim=0))
    #                  assert len(summations)==len(hidden_states)
    #              else:
    #                  for i, (state, m) in enumerate(zip(hidden_states, mean)):
    #                      summations[i] += torch.sum((state-m.unsqueeze(0))**2, dim=0)
    #              sample_size = len(eval_dataset)
    #          elif args.target=="words" or args.target=="all":
    #              if len(summations)==0:
    #                  for state, m in zip(hidden_states, mean):
    #                      delta = ((state-m.unsqueeze(0))**2)*mask
    #                      summations.append(torch.sum(delta, dim=(0, 1)))
    #                  assert len(summations)==len(hidden_states)
    #              else:
    #                  for i, (state, m) in enumerate(zip(hidden_states, mean)):
    #                      delta = ((state-m.unsqueeze(0))**2)*mask
    #                      summations[i] += torch.sum(delta, dim=(0, 1))
    #              sample_size += torch.sum(mask)
    #          else:
    #              raise NotImplementedError
    #  variance = [s/(sample_size) for s in summations]
    #  statistic_variance = [s/(sample_size-1) for s in summations]
    #  assert len(mean) == len(variance) == len(statistic_variance) == 13
    #  assert mean[0].shape == variance[0].shape == statistic_variance[0].shape == (768,)
    output_file = os.path.join(eval_output_dir, f"{prefix}.pkl")
    with open(output_file, "wb") as fout:
        logger.info(f"***** Saving features {prefix}.pkl *****")
        mean = np.array([m.to('cpu').numpy() for m in mean])
        # variance = np.array([v.to('cpu').numpy() for v in variance])
        # statistic_variance = np.array([v.to('cpu').numpy() for v in statistic_variance])
        # pickle.dump({'mean': mean,
        #              'variance': variance,
        #              'statistic_variance': statistic_variance,
        #              'sample_size': sample_size},
        #             fout)
        pickle.dump({'mean': mean}, fout)
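Reading the dumped features back is straightforward; a minimal sketch, assuming the same args and prefix as above (the pickle holds only the layer-wise means, e.g. 13 layers x 768 dims for a BERT-base model with hidden states enabled):

import pickle

with open(os.path.join(args.output_dir, f"{prefix}.pkl"), "rb") as fin:
    features = pickle.load(fin)
layer_means = features["mean"]   # np.ndarray of shape (num_layers, hidden_size)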
Example #12
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    eval_output_dir = args.output_dir
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.dev_tsv is not None:
        from cs272_project.dataset.tsv_dataset import TSVDataset
        eval_dataset = TSVDataset(tokenizer,
                                  tsv_file=args.dev_tsv,
                                  batch_size=args.eval_batch_size,
                                  block_size=args.block_size)
    else:
        eval_dataset = load_and_cache_examples(args,
                                               tokenizer,
                                               evaluate=True,
                                               batch_size=args.eval_batch_size)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=None)

    eval_lm_loss = 0.0
    eval_mc_loss = 0.0
    nb_lm_eval_steps = 0
    nb_mc_eval_steps = 0
    model.eval()
    eval_iters = tqdm(eval_dataloader, desc="Evaluating", dynamic_ncols=True)
    for batch_lm, mc_labels in eval_iters:
        inputs, lm_labels = batch_lm, batch_lm
        inputs = inputs.to(args.device)
        lm_labels = lm_labels.to(args.device)
        mc_labels = mc_labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, lm_labels=lm_labels, mc_labels=mc_labels)
            lm_loss = torch.where(mc_labels == 1, outputs[0],
                                  torch.zeros_like(outputs[0]))
            mc_loss = outputs[1]
            eval_mc_loss += mc_loss.mean().item()
            nb_mc_eval_steps += 1
            if lm_loss.mean().item() != 0.0:
                eval_lm_loss += lm_loss.mean().item()
                nb_lm_eval_steps += 1

        if nb_lm_eval_steps == 0:
            mean_lm_loss = 0
        else:
            mean_lm_loss = eval_lm_loss / nb_lm_eval_steps
        mean_mc_loss = eval_mc_loss / nb_mc_eval_steps
        ppl = 2**mean_lm_loss

        result = {
            "perplexity": ppl,
            "lm_loss": mean_lm_loss,
            "mc_loss": mean_mc_loss
        }

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        eval_stat = f"(eval) lm_loss: {result['lm_loss']:.3f}" \
                    f" mc_loss: {result['mc_loss']:.3f}" \
                    f" ppl: {result['perplexity']:.3f}"
        eval_iters.set_description(eval_stat)
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
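One detail worth flagging: transformers language-model heads return natural-log cross-entropy, for which the conventional perplexity is exp(loss); 2 ** loss, as used above, only matches if the loss were measured in bits. A quick comparison with an illustrative loss value:

import math

mean_lm_loss = 3.0                 # illustrative value, in nats
print(math.exp(mean_lm_loss))      # ~20.09, the conventional perplexity
print(2 ** mean_lm_loss)           # 8.0, what the snippet above reports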
Example #13
def export_pytorch(
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch model to an ONNX Intermediate Representation (IR)

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model to export.
        config ([`~onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """
    if issubclass(type(model), PreTrainedModel):
        import torch
        from torch.onnx import export as onnx_export

        logger.info(f"Using framework PyTorch: {torch.__version__}")
        with torch.no_grad():
            model.config.return_dict = True
            model.eval()

            # Check if we need to override certain configuration item
            if config.values_override is not None:
                logger.info(
                    f"Overriding {len(config.values_override)} configuration item(s)"
                )
                for override_config_key, override_config_value in config.values_override.items(
                ):
                    logger.info(
                        f"\t- {override_config_key} -> {override_config_value}"
                    )
                    setattr(model.config, override_config_key,
                            override_config_value)

            # Ensure inputs match
            # TODO: Check when exporting QA we provide "is_pair=True"
            model_inputs = config.generate_dummy_inputs(
                tokenizer, framework=TensorType.PYTORCH)
            inputs_match, matched_inputs = ensure_model_and_config_inputs_match(
                model, model_inputs.keys())
            onnx_outputs = list(config.outputs.keys())

            if not inputs_match:
                raise ValueError("Model and config inputs doesn't match")

            config.patch_ops()

            # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
            # so we check the torch version for backwards compatibility
            if parse(torch.__version__) <= parse("1.10.99"):
                # export can work with named args but the dict containing named args
                # has to be the last element of the args tuple.
                onnx_export(
                    model,
                    (model_inputs, ),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={
                        name: axes
                        for name, axes in chain(config.inputs.items(),
                                                config.outputs.items())
                    },
                    do_constant_folding=True,
                    use_external_data_format=config.use_external_data_format(
                        model.num_parameters()),
                    enable_onnx_checker=True,
                    opset_version=opset,
                )
            else:
                onnx_export(
                    model,
                    (model_inputs, ),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={
                        name: axes
                        for name, axes in chain(config.inputs.items(),
                                                config.outputs.items())
                    },
                    do_constant_folding=True,
                    opset_version=opset,
                )

            config.restore_ops()

    return matched_inputs, onnx_outputs
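A usage sketch for export_pytorch, assuming a BERT checkpoint and a transformers version that ships ONNX configs (the documentation constructs BertOnnxConfig this way); treat the model name and output path as placeholders:

from pathlib import Path
from transformers import AutoModel, AutoTokenizer
from transformers.models.bert import BertOnnxConfig

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
onnx_config = BertOnnxConfig(model.config)

onnx_inputs, onnx_outputs = export_pytorch(
    tokenizer, model, onnx_config, opset=12, output=Path("bert-base-uncased.onnx"))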
Example #14
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)

        seqs, masks, genres = zip(*examples)

        token_ids, fact_embedding_ids = zip(*[
            get_inputs(seq, mask, tokenizer) for seq, mask, genre in examples
        ])
        labels = [get_labels(mask) for seq, mask, genre in examples]
        pad_seqs = pad_sequence(token_ids,
                                batch_first=True,
                                padding_value=tokenizer.pad_token_id)
        pad_factsembeds = pad_sequence(fact_embedding_ids,
                                       batch_first=True,
                                       padding_value=FACT_EMBEDS_PAD)
        pad_labels = pad_sequence(labels,
                                  batch_first=True,
                                  padding_value=DIST_LABELS_PAD)
        genres = torch.stack(genres)
        return list(zip(pad_seqs, pad_factsembeds, pad_labels, genres))

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):

        if args.mlm:
            inputs, labels = mask_tokens(batch, tokenizer, args)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels)  # this branch only runs when args.mlm is set
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        elif args.xlnet:
            with torch.no_grad():
                seqs, factsembs, labels, genres = zip(*batch)
                tlabels = torch.stack(labels).to(args.device)
                tseqs = torch.stack(seqs).to(args.device)
                tgenres = torch.stack(genres).to(args.device)
                padding_masks = torch.where(tseqs == tokenizer.pad_token_id,
                                            torch.ones_like(tlabels),
                                            torch.zeros_like(tlabels)).to(
                                                args.device)

                outputs = model(tseqs,
                                genre_idxs=tgenres,
                                input_mask=padding_masks,
                                labels=tlabels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = (batch, batch)
            with torch.no_grad():
                outputs = model(inputs, labels=labels)  # args.mlm is False in this branch
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    print(f"validation loss value at step is {eval_loss}")
    logger.info(f"validation loss value at step is {eval_loss}")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"evalloss": eval_loss}

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #15
def evaluate(args,
             data_generator,
             tb_writer,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             global_step,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    criterion = nn.BCEWithLogitsLoss()

    eval_dataset = data_generator.instance_a_valid_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        targets = [b[2] for b in batch]
        inputs = [b[3] for b in batch]

        lens = [len(x) for x in inputs]

        inputs = pad_sequence(inputs,
                              batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()

        tokens, features, targets = [
            torch.tensor(x) for x in [tokens, features, targets]
        ]

        return tokens, features, targets, inputs, attention_mask, torch.tensor(
            lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #    model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    preds, labels = [], []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # training loop
        tokens, features, targets, inputs, attention_mask, lens = batch

        tokens, features, targets, inputs, attention_mask, lens = [
            x.to(args.device)
            for x in [tokens, features, targets, inputs, attention_mask, lens]
        ]

        tokens, features, targets = [
            x.float() for x in [tokens, features, targets]
        ]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            loss = criterion(logit, targets)

            pred = torch.sigmoid(logit).detach().cpu().numpy()
            labels.append(targets.long().detach().cpu().numpy())
            preds.append(pred)

            eval_loss += loss.mean().item()

        nb_eval_steps += 1

    labels = np.vstack(labels)
    preds = np.float64(np.vstack(preds))

    aucprs = []

    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        _prauc = compute_prauc(preds[:, i], labels[:, i])
        _rce = compute_rce(preds[:, i], labels[:, i])

        aucprs.append(_prauc)

        print(engage + ":", _prauc, _rce)

        tb_writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc,
                             global_step)
        tb_writer.add_scalar('RCE/{}_val'.format(engage), _rce, global_step)

    print("Mean AUCPR : {}".format(sum(aucprs) / 4.0))
    tb_writer.add_scalar('PRAUC/mean', sum(aucprs) / 4.0, global_step)
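compute_prauc and compute_rce are not shown, and the function logs to TensorBoard instead of returning a dict despite its -> Dict annotation. A sketch of the two metrics as they are usually defined for this task (the RecSys 2020 reference definitions; the repository's versions may differ):

import numpy as np
from sklearn.metrics import auc, log_loss, precision_recall_curve

def compute_prauc(pred, gt):
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = float(np.mean(gt))
    strawman_cross_entropy = log_loss(gt, [yt] * len(gt))
    return (1.0 - cross_entropy / strawman_cross_entropy) * 100.0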
Example #16
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    labels_file = str(args.eval_data_file).replace('masked_code_', 'mask_')
    labels_lines = [line.rstrip() for line in open(labels_file)]

    step = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Get the labels lines to process
        start = step * len(batch)
        end = start + len(batch) + 1
        lables_to_process = labels_lines[start:end]

        step += 1

        inputs, labels = read_masked_dataset(tokenizer, batch,
                                             lables_to_process)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    perfect_predictions, num_examples = get_number_perfect_predictions(
        model, tokenizer, args.eval_data_file)
    result = {
        "perplexity": perplexity,
        "loss": eval_loss,
        "perfect_predictions": perfect_predictions,
        "total_eval_examples": num_examples
    }

    wandb.log({'val_perplexity': perplexity, 'avg_val_loss': eval_loss})
    wandb.log({'perfect_predictions': perfect_predictions})
    wandb.log(
        {'perfect_predictions_percentage': perfect_predictions / num_examples})

    output_eval_file = os.path.join(
        eval_output_dir, prefix, "eval_results_" + str(time.time()) + ".txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    if args.early_stop > 0:
        # Early stop has been required by the user, check performance
        eval_results_files = glob.glob(
            os.path.join(eval_output_dir, prefix, 'eval_results_*.txt'))
        # glob already returns paths that include eval_output_dir, so stat them directly
        eval_results_files.sort(key=lambda x: os.stat(x).st_mtime)
        if len(eval_results_files) > args.early_stop:
            perfect_predictions_before = read_perfect_predictions_from_file(
                eval_results_files[len(eval_results_files) -
                                   (args.early_stop + 1)])
            if perfect_predictions <= perfect_predictions_before:
                return None

    return result
Example #17
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="",
             data_split="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args,
                                           tokenizer,
                                           data_split=data_split)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(eval_dataset)
    nworkers = 16
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 num_workers=nworkers)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch_step, batch in enumerate(tqdm(eval_dataloader)):

        img, liwc, inputs, labels = batch
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        img = img.unsqueeze(1).to(args.device)
        imgpos = None
        imgcls = None

        liwc = liwc.unsqueeze(1).to(args.device)

        with torch.no_grad():
            lm_loss = 0.
            for cmt_i in range(1, args.num_cmts):
                curcondition = (img, imgpos, imgcls, liwc[:, :, cmt_i, :])
                outputs = model(
                    curcondition,
                    inputs[:, :cmt_i * args.cmt_len],
                    inputs[:, cmt_i * args.cmt_len:(cmt_i + 1) * args.cmt_len],
                    labels=labels[:, cmt_i * args.cmt_len:(cmt_i + 1) *
                                  args.cmt_len])
                lm_loss += outputs[0]
            if args.n_gpu > 1:
                lm_loss = lm_loss.mean()
            eval_loss += lm_loss.item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss)).item()

    result = {"perplexity": perplexity, "eval_loss": eval_loss}

    return result
Example #18
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    cls_loss = 0.0
    kl_loss = 0.0
    bow_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    f = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        histories, responses, knowledges, kn_vocs, segments, chooses = batch
        histories = torch.LongTensor(histories).to(args.device)
        responses = torch.LongTensor(responses).to(args.device)
        knowledges = torch.LongTensor(knowledges).to(args.device)
        kn_vocs = torch.LongTensor(kn_vocs).to(args.device)
        segments = torch.LongTensor(segments).to(args.device)
        chooses = torch.FloatTensor(chooses).to(args.device)

        chooses = torch.cat([
            -100 * torch.ones([histories.shape[0], histories.shape[-1]
                               ]).float().to(args.device), chooses
        ], 1)
        lm_labels = torch.cat([
            -100 * torch.ones([histories.shape[0], histories.shape[-1]
                               ]).long().to(args.device), responses
        ], 1)
        lm_labels[lm_labels == 0] = -100

        with torch.no_grad():
            outputs, x_kn_att = model(input_ids=(histories, responses,
                                                 knowledges, kn_vocs, chooses),
                                      lm_labels=lm_labels,
                                      token_type_ids=segments,
                                      use_posterior=False,
                                      use_bow=args.use_bow)
            loss = outputs[1]
            choose = torch.squeeze(x_kn_att)

            eval_loss += loss.mean().item()
            cls_loss += outputs[0].mean().item()
            #bow_loss += outputs[2].mean().item()
            #kl_loss += outputs[0].mean().item()
        nb_eval_steps += 1

        if not f: logger.info(f"Choose: \n{choose}")
        f = 1
    eval_loss = eval_loss / nb_eval_steps
    cls_loss = cls_loss / nb_eval_steps
    bow_loss = bow_loss / nb_eval_steps
    kl_loss = kl_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity,
        "cls loss": cls_loss,
        "bow loss": bow_loss,
        "kl loss": kl_loss
    }

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #19
def train(args, data, datasets, model: PreTrainedModel, original_model,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    train_datasets = datasets['train']
    dev_datasets = datasets['dev']

    train_dataloaders, train_example_num, train_distribution = create_dataloader(
        args, train_datasets, tokenizer, train=True)
    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
        args, dev_datasets, tokenizer, train=False)

    train_iter_num = sum(
        [len(dataloader) for dataloader in train_dataloaders.values()])
    dev_iter_num = sum(
        [len(dataloader) for dataloader in dev_dataloaders.values()])

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            train_iter_num // args.gradient_accumulation_steps) + 1
    else:
        t_total = train_iter_num // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    original_model = original_model.module if hasattr(
        original_model, "module"
    ) else original_model  # Take care of distributed/parallel training
    original_model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
        original_model = torch.nn.DataParallel(original_model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        original_model = torch.nn.parallel.DistributedDataParallel(
            original_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_example_num)
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    best_loss = float('inf')
    best_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (train_iter_num //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                train_iter_num // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    model.zero_grad()
    original_model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    def inner_product(x, y):
        return torch.mean(torch.sum(y * x, 3))

    def mean_square(x, y, idx):
        return torch.mean(torch.mean((y - x)**2, idx))
        #return torch.mean(torch.sum((y - x) ** 2, 3))

    def save_best_model(best_loss, best_step, dev_dataloaders):
        eval_loss = None
        if (
                args.local_rank == -1 and args.evaluate_during_training
        ):  # Only evaluate when single GPU otherwise metrics may not average well
            eval_loss = evaluate(model, attributes_hiddens, dev_dataloaders)
            #eval_loss = evaluate(args, model, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens)
            logger.info(" global_step = %s, evaluate loss = %s", global_step,
                        eval_loss)
            tb_writer.add_scalar("eval_loss", eval_loss, global_step)
        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)

        # Only update the best checkpoint when a dev loss was actually computed.
        if eval_loss is not None and eval_loss < best_loss:
            best_loss = eval_loss
            best_step = global_step
            checkpoint_prefix = "checkpoint"
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "checkpoint-best")
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)

            #_rotate_checkpoints(args, checkpoint_prefix)

            torch.save(optimizer.state_dict(),
                       os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(),
                       os.path.join(output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s",
                        output_dir)
        logger.info(" best_step = %s, best loss = %s", best_step, best_loss)

        return best_loss, best_step

    def get_hiddens_of_model(input):
        model.zero_grad()
        if args.model_type == 'roberta':
            _, _, hiddens = model.roberta(input)
        elif args.model_type == 'bert':
            _, _, hiddens = model.bert(input)
        elif args.model_type == 'albert':
            _, _, hiddens = model.albert(input)
        elif args.model_type == 'dbert':
            _, hiddens = model.distilbert(input)
        elif args.model_type == 'electra':
            _, hiddens = model.electra(input)
        elif args.model_type == 'gpt2':
            _, _, hiddens = model.transformer(input)
        elif args.model_type == 'gpt':
            _, hiddens = model.transformer(input)

        return hiddens

    def attribute_vector_example():
        attributes_hiddens = {f'attribute{i}': [] for i in range(2)}

        dataloaders, _, distribution = create_dataloader(args,
                                                         train_datasets,
                                                         tokenizer,
                                                         train=True)
        for key in distribution:
            if key != 'neutral':
                inputs, labels = next(dataloaders[key])
                inputs = inputs.to(args.device)
                hiddens = get_hiddens_of_model(inputs)
                hiddens = torch.stack(hiddens, 2)
                if labels.size(1) > 1:
                    onehot = torch.eye(hiddens.size(1))
                    zeros = torch.zeros(1, onehot.size(0))
                    onehot = torch.cat((zeros, onehot), 0)
                    onehot = onehot[labels]
                    onehot = torch.sum(onehot, 1)
                    onehot = onehot.view(hiddens.size(0), -1, 1, 1)
                else:
                    onehot = torch.eye(hiddens.size(1))[labels].view(
                        hiddens.size(0), -1, 1, 1)
                onehot = onehot.to(args.device)
                attributes_hiddens[key].append(
                    torch.sum(hiddens * onehot, 1) / labels.size(1))

        # 'neutral' is included as well, hence the -1 below
        attribute_size = len(data['train']['example'])
        for i in range(attribute_size - 1):
            attributes_hiddens[f'attribute{i}'] = torch.mean(
                torch.cat(attributes_hiddens[f'attribute{i}'], 0),
                0).detach().unsqueeze(0)

        return attributes_hiddens

    def forward(attributes_hiddens, dataloaders, key):
        inputs = next(dataloaders[key])
        if len(inputs) == 2:
            inputs, labels = inputs
            labels = labels.to(args.device)
        else:
            labels = None
        inputs = inputs.to(args.device)
        if args.model_type == 'roberta':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.roberta(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.roberta(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)
        elif args.model_type == 'bert':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.bert(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.bert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.cls(final_layer_hiddens)
                    token_original = original_model.cls(
                        final_layer_original_hiddens)
        elif args.model_type == 'albert':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.albert(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.albert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.classifier(final_layer_hiddens)
                    token_original = original_model.classifier(
                        final_layer_original_hiddens)
        elif args.model_type == 'dbert':
            final_layer_hiddens, all_layer_hiddens = model.distilbert(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.distilbert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.classifier(final_layer_hiddens)
                    token_original = original_model.classifier(
                        final_layer_original_hiddens)
        elif args.model_type == 'electra':
            final_layer_hiddens, all_layer_hiddens = model.electra(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.electra(
                        inputs)
                if args.token_loss:
                    hiddens = model.generator_predictions(final_layer_hiddens)
                    token_predicts = model.generator_lm_head(hiddens)
                    original_hiddens = original_model.generator_predictions(
                        final_layer_original_hiddens)
                    token_original = original_model.generator_lm_head(
                        original_hiddens)
        elif args.model_type == 'gpt2':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.transformer(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.transformer(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)
        elif args.model_type == 'gpt':
            final_layer_hiddens, all_layer_hiddens = model.transformer(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.transformer(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)

        all_layer_hiddens = torch.stack(all_layer_hiddens, 2)
        if 'neutral' != key:
            all_original_hiddens = torch.stack(all_layer_original_hiddens, 2)
            all_original_hiddens = all_original_hiddens.detach()
            if args.token_loss:
                # Detach the frozen model's outputs so no gradient flows back into it.
                if args.model_type == 'electra':
                    original_hiddens = original_hiddens.detach()
                token_original = token_original.detach()
        if args.debias_layer == 'all':
            target_layer_hiddens = all_layer_hiddens
            target_original_hiddens = all_layer_hiddens
        else:
            if args.debias_layer == 'first':
                idx = 0
            elif args.debias_layer == 'last':
                idx = -1
            target_layer_hiddens = all_layer_hiddens[:, :, idx]
            target_layer_hiddens = target_layer_hiddens.unsqueeze(2)
            if 'neutral' != key:
                target_original_hiddens = all_original_hiddens[:, :, idx]
                target_original_hiddens = target_original_hiddens.unsqueeze(2)
            else:
                attributes_hiddens = {
                    key: value[:, idx, :].unsqueeze(1)
                    for key, value in attributes_hiddens.items()
                }

        if args.loss_target == 'sentence' or labels is None:
            attributes_hiddens = {
                key: value.unsqueeze(1)
                for key, value in attributes_hiddens.items()
            }
        #elif args.loss_target == 'token' and key == 'neutral':
        elif args.loss_target == 'token':
            if labels.size(1) > 1:
                onehot = torch.eye(target_layer_hiddens.size(1))
                zeros = torch.zeros(1, onehot.size(0))
                onehot = torch.cat((zeros, onehot), 0)
                onehot = onehot[labels]
                onehot = torch.sum(onehot, 1)
                onehot = onehot.view(target_layer_hiddens.size(0), -1, 1, 1)
            else:
                onehot = torch.eye(target_layer_hiddens.size(1))[labels].view(
                    target_layer_hiddens.size(0), -1, 1, 1)
            onehot = onehot.to(args.device)
            target_layer_hiddens = torch.sum(target_layer_hiddens * onehot,
                                             1).unsqueeze(1) / labels.size(1)
            if 'neutral' != key:
                target_original_hiddens = torch.sum(
                    target_original_hiddens * onehot,
                    1).unsqueeze(1) / labels.size(1)
            else:
                attributes_hiddens = {
                    key: value.expand(target_layer_hiddens.size(0), 1,
                                      value.size(1), value.size(2))
                    for key, value in attributes_hiddens.items()
                }

        if 'neutral' == key:
            loss = 0
            for attribute_hiddens in attributes_hiddens.values():
                tmp_loss = criterion_ip(target_layer_hiddens,
                                        attribute_hiddens)
                if args.square_loss:
                    tmp_loss = tmp_loss**2
                tmp_loss *= alpha
                loss += tmp_loss
        else:
            #loss = criterion_ms(target_layer_hiddens, target_original_hiddens)
            loss = criterion_ms(all_layer_hiddens, all_original_hiddens, 3)
            if args.token_loss:
                loss += criterion_ms(token_predicts, token_original, 2)
                #loss += criterion_ms(hiddens, original_hiddens, 2)
            loss *= beta

        return loss

    #def evaluate(args, model: PreTrainedModel, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens, prefix="") -> Dict:
    def evaluate(model, attributes_hiddens, dev_dataloaders, prefix=""):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_output_dir = args.output_dir

        if args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly

        # multi-gpu evaluate
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", dev_example_num)
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        model.eval()
        #criterion.eval()

        for key in tqdm(dev_distribution):
            with torch.no_grad():
                loss = forward(attributes_hiddens, dev_dataloaders, key)

                eval_loss += loss.item()

                model.zero_grad()
                original_model.zero_grad()

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        '''
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            logger.info("  Loss = %s", eval_loss)
            writer.write("Loss = %s\n" % (eval_loss))
        '''

        return eval_loss

    #criterion_ms = torch.nn.MSELoss()
    criterion_ms = mean_square
    #criterion.train()
    criterion_ip = inner_product
    original_model.eval()

    alpha, beta = args.weighted_loss
    alpha = float(alpha)
    beta = float(beta)

    train_loss = 0.0

    for _ in train_iterator:

        random.shuffle(train_distribution)
        epoch_iterator = tqdm(train_distribution,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])

        model.eval()
        with torch.no_grad():
            attributes_hiddens = attribute_vector_example()

        for step, key in enumerate(epoch_iterator):
            model.train()

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            loss = forward(attributes_hiddens, train_dataloaders, key)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                original_model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logger.info(" global_step = %s, train loss = %s",
                                global_step, train_loss)
                    train_loss = 0.0
                    # Log metrics
                    best_loss, best_step = save_best_model(
                        best_loss, best_step, dev_dataloaders)
                    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
                        args, dev_datasets, tokenizer, train=False)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
            train_dataloaders, train_example_num, train_distribution = create_dataloader(
                args, train_datasets, tokenizer, train=True)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
        args, dev_datasets, tokenizer, train=False)
    best_loss, best_step = save_best_model(best_loss, best_step,
                                           dev_dataloaders)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
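
The nested forward above mixes two hand-written criteria: sentences tagged 'neutral' are penalized through the inner product between their hidden states and the pre-computed attribute vectors (scaled by alpha, optionally squared), while attribute sentences are pulled toward the frozen original_model with a mean-squared term scaled by beta. A rough, self-contained sketch of that arithmetic; every shape and value here is made up for illustration only:

import torch

# Toy tensors shaped [batch, tokens, layers, hidden], mirroring the stacked hidden states above.
neutral_hiddens = torch.randn(2, 1, 13, 768)       # target_layer_hiddens for a neutral batch
attribute_vector = torch.randn(1, 1, 13, 768)      # one broadcastable entry of attributes_hiddens
original_hiddens = torch.randn(2, 4, 13, 768)      # frozen original_model hidden states
tuned_hiddens = original_hiddens + 0.01 * torch.randn_like(original_hiddens)

alpha, beta = 0.2, 0.8  # stand-ins for args.weighted_loss

# inner_product: penalize overlap between neutral hidden states and the attribute direction
neutral_loss = alpha * torch.mean(torch.sum(neutral_hiddens * attribute_vector, 3))
# mean_square: keep the tuned model close to the frozen original on attribute sentences
attribute_loss = beta * torch.mean(torch.mean((tuned_hiddens - original_hiddens) ** 2, 3))
print(neutral_loss.item(), attribute_loss.item())
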
Example #20
def export(
    tokenizer: PreTrainedTokenizer, model: PreTrainedModel, config: OnnxConfig, opset: int, output: Path
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR

    Args:
        tokenizer:
        model:
        config:
        opset:
        output:

    Returns:

    """
    if not is_torch_available():
        raise ImportError("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export

    from ..file_utils import torch_version

    if not is_torch_onnx_dict_inputs_support_available():
        raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")

    logger.info(f"Using framework PyTorch: {torch.__version__}")
    with torch.no_grad():
        model.config.return_dict = True
        model.eval()

        # Check if we need to override certain configuration item
        if config.values_override is not None:
            logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
            for override_config_key, override_config_value in config.values_override.items():
                logger.info(f"\t- {override_config_key} -> {override_config_value}")
                setattr(model.config, override_config_key, override_config_value)

        # Ensure inputs match
        # TODO: Check when exporting QA we provide "is_pair=True"
        model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
        inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
        onnx_outputs = list(config.outputs.keys())

        if not inputs_match:
            raise ValueError("Model and config inputs don't match")

        config.patch_ops()

        # export can work with named args, but the dict containing the named args has to be the last element of the args tuple
        export(
            model,
            (model_inputs,),
            f=output.as_posix(),
            input_names=list(config.inputs.keys()),
            output_names=onnx_outputs,
            dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
            do_constant_folding=True,
            use_external_data_format=config.use_external_data_format(model.num_parameters()),
            enable_onnx_checker=True,
            opset_version=opset,
        )

        config.restore_ops()

    return matched_inputs, onnx_outputs
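
For context, a minimal usage sketch of the export() helper above (not part of the original snippet). It assumes a model-specific OnnxConfig subclass is available; MyModelOnnxConfig below is purely hypothetical and stands in for whatever configuration class matches the chosen architecture.

from pathlib import Path

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")
onnx_config = MyModelOnnxConfig(model.config)  # hypothetical OnnxConfig subclass

onnx_inputs, onnx_outputs = export(
    tokenizer=tokenizer,
    model=model,
    config=onnx_config,
    opset=12,
    output=Path("model.onnx"),
)
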
def evaluate(args, eval_dataset: CoLDataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    torch.cuda.empty_cache() 
    # # Loop to handle MNLI double evaluation (matched, mis-matched)
    # eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size
    # Note that DistributedSampler samples randomly

    def col_collate(examples):
        tokens, vokens = zip(*examples)
        if tokenizer._pad_token is None:
            tokens = pad_sequence(tokens, batch_first=True)
        else:
            tokens = pad_sequence(tokens, batch_first=True, padding_value=tokenizer.pad_token_id)
        vokens = pad_sequence(vokens, batch_first=True, padding_value=-100)
        return tokens, vokens

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=col_collate
    )

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    total_token_loss = 0.0
    total_voken_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for tokens, vokens in tqdm(eval_dataloader, desc="Evaluating"):
        token_inputs, token_labels, voken_labels = mask_tokens(tokens, vokens, tokenizer, args)
        token_inputs = token_inputs.to(args.device)
        token_labels = token_labels.to(args.device) if args.mlm_ratio != 0 else None
        voken_labels = voken_labels.to(args.device)
        # If some of the input is padded, then the attention mask is needed
        attention_mask = (token_inputs != tokenizer.pad_token_id)  # word_tokens --> 1, pad_token --> 0
        if attention_mask.all():
            attention_mask = None

        with torch.no_grad():
            outputs = model(token_inputs,
                            attention_mask=attention_mask,
                            masked_lm_labels=token_labels,
                            voken_labels=voken_labels)
            voken_loss = outputs[0]
            token_loss = outputs[1]

            total_voken_loss += voken_loss.item()
            total_token_loss += token_loss.item()

        nb_eval_steps += 1

    total_token_loss = total_token_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(total_token_loss)).item()

    result = {"perplexity": perplexity,
              "voken_loss": total_voken_loss / nb_eval_steps}
    torch.cuda.empty_cache() 

    return result
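
The reported perplexity is simply the exponential of the average token-level cross-entropy accumulated over the evaluation batches. A tiny stand-alone sketch, with a made-up loss value:

import torch

mean_token_loss = 2.31  # made-up average masked-LM loss over nb_eval_steps batches
perplexity = torch.exp(torch.tensor(mean_token_loss)).item()
print(perplexity)  # ~10.07, i.e. as uncertain as a uniform choice over ~10 tokens
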
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    acc = []
    pos_loss_list = []
    neg_loss_list = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # for batch in batchs:
        batch['input_ids'] = batch['input_ids'].to(args.device)
        batch['masked_lm_labels'] = batch['masked_lm_labels'].to(args.device)
        batch['attention_mask'] = batch['attention_mask'].to(args.device)

        with torch.no_grad():
            outputs = model(**batch)
            lm_loss = outputs[0]
            pos_loss = lm_loss[::2]
            neg_loss = lm_loss[1::2]
            # pos_loss = model(**batchs[0])[0]
            # neg_loss = model(**batchs[1])[0]
            acc.extend((pos_loss < neg_loss).long().tolist())
            pos_loss_list.extend(pos_loss.tolist())
            neg_loss_list.extend(neg_loss.tolist())
        nb_eval_steps += 1

    # eval_loss = eval_loss / nb_eval_steps
    # perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "acc": np.mean(acc),
        "pos_loss": np.mean(pos_loss_list),
        "neg_loss": np.mean(neg_loss_list)
    }

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
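
The accuracy above relies on the dataloader interleaving each positive example with its negative counterpart, so even indices hold positive losses and odd indices hold negative losses. A small stand-alone sketch of that slicing, with made-up values:

import torch

# Made-up per-example losses for three (positive, negative) pairs, interleaved as
# [pos0, neg0, pos1, neg1, pos2, neg2] -- the layout the slicing above assumes.
lm_loss = torch.tensor([1.2, 2.5, 0.8, 0.7, 1.0, 1.9])
pos_loss = lm_loss[::2]   # losses of the positive continuations
neg_loss = lm_loss[1::2]  # losses of the negative continuations
acc = (pos_loss < neg_loss).float().mean()
print(acc)  # tensor(0.6667): the second pair is ranked the wrong way round
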
Example #23
def evaluate(args,
             eval_dataset,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             run_batch_fn,
             desc="") -> Dict:
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    # eval_batch_size for selection must be 1 to handle variable number of candidates
    if args.task == "selection":
        args.eval_batch_size = 1
    else:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=eval_dataset.collate_fn)

    # multi-gpu evaluate
    if args.n_gpu > 1 and (args.task != "selection"
                           or eval_dataset.args.eval_all_snippets):
        if not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []
    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            loss, lm_logits, mc_logits, mc_labels = run_batch_fn(
                args, model, batch)
            if args.task == "detection":
                mc_logits = mc_logits.sigmoid()
            if args.task in ["selection", "detection"]:
                data_infos.append(batch[-1])
            all_preds.append(mc_logits.detach().cpu().numpy())
            all_labels.append(mc_labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if args.task.lower() in ("generation", "reconstruction"):
        perplexity = torch.exp(torch.tensor(eval_loss))
        result = {"perplexity": perplexity, "loss": eval_loss}
    elif args.task.lower() == "selection":
        all_labels = np.array(all_labels).reshape(-1)
        all_pred_ids = np.array([np.argmax(logits) for logits in all_preds])
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        logger.info("Avg. # of candidates: %f",
                    sum([len(arr[0]) for arr in all_preds]) / len(all_preds))
        result = {"loss": eval_loss, "accuracy": accuracy}
        if args.output_file:
            sorted_pred_ids = [
                np.argsort(logits.squeeze())[::-1] for logits in all_preds
            ]
            write_selection_preds(eval_dataset.dataset_walker,
                                  args.output_file,
                                  data_infos,
                                  sorted_pred_ids,
                                  topk=5)
    elif args.task.lower() == "detection":
        all_labels = np.concatenate(all_labels)
        all_pred_ids = (np.concatenate(all_preds) > 0.5)
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        precision = sklearn.metrics.precision_score(all_labels, all_pred_ids)
        recall = sklearn.metrics.recall_score(all_labels, all_pred_ids)
        result = {
            "loss": eval_loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }
        if args.output_file:
            write_detection_preds(eval_dataset.dataset_walker,
                                  args.output_file, data_infos, all_pred_ids)
    else:
        raise ValueError(
            "args.task not in ['generation', 'reconstruction', 'selection', 'detection'], got %s"
            % args.task)

    if args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
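
A note on the generation/reconstruction branch above: it uses a tuple membership test rather than chaining == with or, because a chained form such as x == "generation" or "reconstruction" is always truthy (a non-empty string literal counts as True on its own). A quick stand-alone illustration:

task = "selection"

chained = task.lower() == "generation" or "reconstruction"   # -> "reconstruction", always truthy
member = task.lower() in ("generation", "reconstruction")    # -> False, as intended
print(bool(chained), member)  # True False
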
Example #24
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args,
                                           tokenizer,
                                           evaluate=True,
                                           doubling=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer,
                                     args) if args.mlm else (batch, batch)

        labels = inputs.clone()
        labels[:, -1] = -100

        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, lm_labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    ###### Evaluate NSP accuracy
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    eval_dataset_second = load_and_cache_examples(args,
                                                  tokenizer,
                                                  evaluate=True,
                                                  second=True)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    eval_correct_sampler = SequentialSampler(eval_dataset_second)
    eval_correct_dataloader = DataLoader(eval_dataset_second,
                                         sampler=eval_correct_sampler,
                                         batch_size=args.eval_batch_size,
                                         collate_fn=collate)

    eval_wrong_sampler = RandomSampler(eval_dataset_second)
    eval_wrong_dataloader = DataLoader(eval_dataset_second,
                                       sampler=eval_wrong_sampler,
                                       batch_size=args.eval_batch_size,
                                       collate_fn=collate)

    nb_eval_steps = 0
    num_correctly_predicted = 0
    num_wrongly_predicted = 0
    for zipped_batch in tqdm(zip(eval_dataloader, eval_correct_dataloader,
                                 eval_wrong_dataloader),
                             desc="Evaluating",
                             total=len(eval_dataloader)):
        batch, correct_batch, wrong_batch = zipped_batch
        inputs, labels = mask_tokens(batch, tokenizer,
                                     args) if args.mlm else (batch, batch)

        second_input = None
        if_correct = False
        if random.randint(0, 1) == 1:
            second_input = correct_batch
            if_correct = True
        else:
            second_input = wrong_batch
            if_correct = False

        first_merged_inputs = torch.cat((inputs, second_input), 1)
        first_merged_inputs = first_merged_inputs.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels
                            ) if args.mlm else model(first_merged_inputs)
            mc_logits = outputs[2].cpu()
            #print(mc_logits.shape)
            #print(if_correct, mc_logits)
            for jj in range(mc_logits.shape[0]):
                if (mc_logits[jj, 1] > mc_logits[jj, 0]) == if_correct:
                    num_correctly_predicted += 1
                else:
                    num_wrongly_predicted += 1
        nb_eval_steps += 1

    total_predicted = num_correctly_predicted + num_wrongly_predicted
    accuracy = num_correctly_predicted / total_predicted
    result["accuracy"] = accuracy

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
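
The second half of the evaluation turns the mc_logits head into a next-sentence accuracy: column 1 should outscore column 0 exactly when the appended second sentence really was the correct continuation. A stand-alone sketch with made-up logits (labelled per pair here rather than per batch, for illustration):

import torch

# Made-up next-sentence logits (column 0: "not next", column 1: "is next") and
# whether the appended second sentence really was the correct continuation.
mc_logits = torch.tensor([[0.2, 1.5],
                          [1.1, -0.3],
                          [0.0, 0.4],
                          [0.1, 2.0]])
if_correct = torch.tensor([True, False, False, True])

predicted_is_next = mc_logits[:, 1] > mc_logits[:, 0]
accuracy = (predicted_is_next == if_correct).float().mean()
print(accuracy)  # tensor(0.7500): the third pair is judged wrongly
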
Example #25
def save_preds(args,
               data_generator,
               tb_writer,
               model: PreTrainedModel,
               tokenizer: PreTrainedTokenizer,
               global_step,
               prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    criterion = nn.BCEWithLogitsLoss()

    eval_dataset = data_generator.instance_a_lb_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        tweet_ids = [b[3] for b in batch]
        user_ids = [b[4] for b in batch]
        inputs = [b[2] for b in batch]

        lens = [len(x) for x in inputs]

        inputs = pad_sequence(inputs,
                              batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()

        tokens, features = [torch.tensor(x) for x in [tokens, features]]

        return tokens, features, tweet_ids, user_ids, inputs, attention_mask, torch.tensor(
            lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #    model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    nb_eval_steps = 0
    model.eval()

    tweets, users, preds = [], [], []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # training loop
        tokens, features, tweet_ids, user_ids, inputs, attention_mask, lens = batch

        tokens, features, inputs, attention_mask, lens = [
            x.to(args.device)
            for x in [tokens, features, inputs, attention_mask, lens]
        ]

        tokens, features = [x.float() for x in [tokens, features]]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            tweets += tweet_ids
            users += user_ids
            preds.append(pred)

        nb_eval_steps += 1

        #if nb_eval_steps == 10:
        #    break

    tweets = np.array(tweets)
    users = np.array(users)
    preds = np.float64(np.vstack(preds))
    print(tweets.shape, users.shape, preds.shape)
    print(tweets[0:10])
    print(users[0:10])

    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        preds_i = preds[:, i]
        print(preds_i.shape)
        with open(
                args.test_inference_path + "submission_{}.csv".format(engage),
                "w") as f:
            for k in range(preds_i.shape[0]):
                f.write(
                    str(tweets[k]) + "," + str(users[k]) + "," +
                    str(preds_i[k]) + "\n")
            print("Saved to csv the predictions for task {}".format(engage))
Example #26
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
	# Loop to handle MNLI double evaluation (matched, mis-matched)
	eval_output_dir = args.output_dir

	# MODIF FOR EVAL SCRIPT / USE CUSTOM DATASET
	#eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
	vectorizer = VectorizeParagraph(tokenizer=tokenizer,
									block_size=GPT2_BLOCK_SIZE,
									mode=VectorizeMode.TRAIN,
									use_context=True,
									select_summary=lambda input_dict: random.choice(list(input_dict.values())))

	eval_dataset = DatasetFromRepo(path=args.eval_data_file, transform=vectorizer)

	if args.local_rank in [-1, 0]:
		os.makedirs(eval_output_dir, exist_ok=True)

	args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
	# Note that DistributedSampler samples randomly

	def collate(examples: List[Tuple[torch.Tensor]]):
		all_inputs = [elt[0] for elt in examples]
		all_types = [elt[1] for elt in examples]
		all_labels = [elt[2] for elt in examples]

		padded_inputs = pad_sequence(all_inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
		padded_types = pad_sequence(all_types, batch_first=True, padding_value=tokenizer.pad_token_id)
		padded_labels = pad_sequence(all_labels, batch_first=True, padding_value=-100)

		return padded_inputs, padded_types, padded_labels

	eval_sampler = SequentialSampler(eval_dataset)
	eval_dataloader = DataLoader(
		eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
	)

	# multi-gpu evaluate
	if args.n_gpu > 1:
		model = torch.nn.DataParallel(model)

	# Eval!
	logger.info("***** Running evaluation {} *****".format(prefix))
	logger.info("  Num examples = %d", len(eval_dataset))
	logger.info("  Batch size = %d", args.eval_batch_size)
	eval_loss = 0.0
	nb_eval_steps = 0
	model.eval()

	for batch in tqdm(eval_dataloader, desc="Evaluating"):
		inputs, types, labels = batch
		inputs = inputs.to(args.device)
		types = types.to(args.device)
		labels = labels.to(args.device)

		with torch.no_grad():
			outputs = model(inputs, labels=labels, token_type_ids=types)
			lm_loss = outputs[0]
			eval_loss += lm_loss.mean().item()
		nb_eval_steps += 1

	eval_loss = eval_loss / nb_eval_steps
	perplexity = torch.exp(torch.tensor(eval_loss))

	result = {"perplexity": perplexity}

	output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
	with open(output_eval_file, "w") as writer:
		logger.info("***** Eval results {} *****".format(prefix))
		for key in sorted(result.keys()):
			logger.info("  %s = %s", key, str(result[key]))
			writer.write("%s = %s\n" % (key, str(result[key])))

	return result
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(eval_dataset)

    def collate(examples: List[Dict]):
        inputs, inputs_type, labels = [], [], []
        for sample in examples:
            inputs.append(sample['inputs'])
            inputs_type.append(sample['inputs_type'])
            labels.append(sample['label'])
        labels = torch.LongTensor(labels)

        if tokenizer._pad_token is None:
            return {
                'inputs': pad_sequence(inputs, batch_first=True),
                'inputs_type': pad_sequence(inputs_type, batch_first=True),
                'label': labels
            }
        return {
            'inputs':
            pad_sequence(inputs,
                         batch_first=True,
                         padding_value=tokenizer.pad_token_id),
            'inputs_type':
            pad_sequence(inputs_type,
                         batch_first=True,
                         padding_value=tokenizer.pad_token_id),
            'label':
            labels
        }

    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    n_correct, n_recall, n_precision = [0, 0], [0, 0], [0, 0]
    score_list, label_list = [], []

    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, inputs_type, labels = batch['inputs'], batch[
            'inputs_type'], batch['label']
        inputs = inputs.to(args.device)
        inputs_type = inputs_type.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, token_type_ids=inputs_type, labels=labels)
            lm_loss, logits = outputs[0], outputs[1]

            eval_loss += lm_loss.mean().item()

            prediction = torch.max(logits.view(-1, 2),
                                   dim=-1)[1]  # TODO: magic number
            n_correct_vec = prediction.eq(labels).float()

            n_correct[0] += n_correct_vec.sum()
            n_correct[1] += prediction.size(0)
            n_recall[0] += torch.sum(n_correct_vec * labels.eq(1).float())
            n_recall[1] += labels.eq(1).float().sum()
            n_precision[0] += torch.sum(n_correct_vec *
                                        prediction.eq(1).float())
            n_precision[1] += prediction.eq(1).float().sum()

            score_list += torch.softmax(logits,
                                        dim=-1)[:, 1].detach().cpu().tolist()
            label_list += labels.detach().cpu().tolist()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    score_list, label_list = np.array(score_list), np.array(label_list)
    fpr, tpr, thresholds = metrics.roc_curve(label_list,
                                             score_list,
                                             pos_label=1)
    auc = metrics.auc(fpr, tpr)

    result = {
        "loss": eval_loss,
        'AUC': auc,
        'accuracy': n_correct[0] / n_correct[1],
        'recall': n_recall[0] / n_recall[1],
        'precision': n_precision[0] / n_precision[1]
    }

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
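
The AUC above comes straight from the collected positive-class scores and gold labels via sklearn.metrics. A small stand-alone sketch with made-up scores:

import numpy as np
from sklearn import metrics

# Made-up positive-class scores and gold labels standing in for score_list/label_list.
score_list = np.array([0.9, 0.8, 0.6, 0.35, 0.1, 0.05])
label_list = np.array([1, 1, 1, 0, 0, 0])

fpr, tpr, thresholds = metrics.roc_curve(label_list, score_list, pos_label=1)
auc = metrics.auc(fpr, tpr)
print(auc)  # 1.0: every positive is scored above every negative in this toy data
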
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="",
             debug=False) -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch_idx, batch in enumerate(tqdm(eval_dataloader,
                                           desc="Evaluating")):
        inputs, labels = mask_tokens(batch, tokenizer,
                                     args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1
        if debug and batch_idx == 10:
            break

    eval_loss = eval_loss / nb_eval_steps
    perplexity = np.exp(eval_loss)

    return eval_loss, perplexity
Example #29
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, global_step = None, tr_loss = None, prefix="") -> Dict: # added global_step, tr_loss -TJ
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size #* max(1, args.n_gpu) # commented -TJ 
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): # added second clause -TJ
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "a") as writer: # changed mode from w to a -TJ
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            #writer.write("%s = %s\n" % (key, str(result[key]))) # modifying what to log -TJ
            dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            logstr ='{} Step {}: train loss = {:.3f}, valid loss = {:.3f}, valid perpl = {:.1f}\n'.format(dt_string, global_step, tr_loss, eval_loss, perplexity)
            writer.write(logstr)

    return result
Example #30
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      unit_scale=args.eval_batch_size,
                      unit="examples"):
        if (args.eval_subsampling != 1.0
                and random.random() >= args.eval_subsampling):
            continue

        if args.wiki_dataset:
            if args.mlm:
                raise RuntimeError("Can't do mlm for wiki dataset")

            tokens, loss_mask = batch
            inputs, labels = (tokens, tokens)

            loss_mask = loss_mask.to(args.device)
            loss_weights = (~loss_mask) + loss_mask * args.title_scale
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            outputs = model(inputs, labels=labels, loss_weights=loss_weights)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)

            with torch.no_grad():
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    loss = torch.tensor(eval_loss)

    result = {"perplexity": perplexity, "loss": loss}

    if args.eval_creativity_blacklist:
        if not args.parsed_dictionary_dataset:
            raise RuntimeError(
                "Evaluating creativity blacklist with non-parsed dictionary dataset"
            )

        blacklist = datasets.Blacklist.load(args.eval_creativity_blacklist)

        print(
            f"Evaluating creativity over {args.num_eval_creativity} words with {args.eval_creativity_batch_size} batch size"
        )
        s = time.time()
        result.update(
            datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(
                tokenizer,
                model,
                blacklist,
                args.num_eval_creativity,
                args.eval_creativity_batch_size,
                max_length=args.block_size,
            ))
        print(f"Done evaluating creativity in {time.time() - s}s")

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result