Example 1
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


def create_dataloader(args, dataset, tokenizer):
    """
    Function to create a PyTorch Dataloader from a given dataset.
    Inputs:
        args - Namespace object from the argument parser
        dataset - Dataset to convert to Dataloader
        tokenizer - BERT tokenizer instance
    Outputs:
        dataset - DataLoader object of the dataset
    """

    # create a data collator function
    data_collator = DataCollatorWithPadding(tokenizer)

    # create the dataloader
    dataset = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        drop_last=False,
        shuffle=True,
    )

    # return the dataset
    return dataset
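A minimal usage sketch for the helper above, assuming an `args` namespace that only needs a `batch_size` field and a dataset that has already been tokenized into `input_ids`/`attention_mask` columns with a `labels` column (the dataset and variable names below are placeholders, not part of the original example):

from argparse import Namespace

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
raw = load_dataset("glue", "sst2", split="train")
# tokenize and keep only the columns the collator should pad
tokenized = raw.map(lambda ex: tokenizer(ex["sentence"], truncation=True),
                    batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized = tokenized.remove_columns(["sentence", "idx"])

args = Namespace(batch_size=32)
loader = create_dataloader(args, tokenized, tokenizer)
batch = next(iter(loader))  # padded input_ids, attention_mask, labels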
Example 2
    def __init__(self,
                 args,
                 model,
                 tokenizer,
                 train_dataset,
                 eval_dataset,
                 data_collator=None,
                 compute_metrics=None,
                 rank=0,
                 world_size=1):

        self.args = args
        self.model_name_or_path = ""
        self.world_size = world_size  # torch.distributed.get_world_size()
        self.checkpoint_dir = args.output_dir

        self.label_names = ["labels"]

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(
            tokenizer)

        self.data_collator = data_collator if data_collator is not None else default_collator

        # Mixed precision setup
        self.use_apex = False
        self.use_amp = False
        self.fp16_backend = None

        self.compute_metrics = compute_metrics

        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.num_train_epochs = int(args.num_train_epochs)

        self.logging_steps = args.logging_steps
        self.save_steps = args.save_steps
        self.eval_steps = args.eval_steps

        self.weight_decay = args.weight_decay
        self.learning_rate = args.learning_rate
        self.layerwise_lr_decay = 1
        self.adam_epsilon = args.adam_epsilon
        self.max_grad_norm = args.max_grad_norm

        self.device = args.device
        model = model.to(args.device)

        self.model = model

        self.tokenizer = tokenizer
        self.optimizer = self._build_optimizer()
        self.rank = rank
        self.timer = Timer(builtin_keys=('wall', 'io', 'gpu', 'merge', 'cv'))

        self.evaluate_during_training = True

        self.output_hidden_states = None
        self.output_attentions = None
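The constructor calls a `_build_optimizer` helper that is not shown here. A minimal sketch of what such a method could look like, assuming the common Hugging Face convention of exempting bias and LayerNorm weights from weight decay (the parameter grouping and the plain `torch.optim.AdamW` choice are assumptions, not taken from the original class):

    def _build_optimizer(self):
        # group parameters so that bias and LayerNorm weights get no weight decay
        no_decay = ["bias", "LayerNorm.weight"]
        grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        return torch.optim.AdamW(grouped_parameters,
                                 lr=self.learning_rate,
                                 eps=self.adam_epsilon)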
Example 3
from typing import Dict, List, Union

import torch
from transformers import (DataCollatorForTokenClassification,
                          DataCollatorWithPadding)
from transformers.data.data_collator import InputDataClass


def TaskDependentCollator(tokenizer):
    token_collator = DataCollatorForTokenClassification(tokenizer)
    default_collator = DataCollatorWithPadding(tokenizer)

    def inner(
        task_key: str, features: List[Union[InputDataClass,
                                            Dict]]) -> Dict[str, torch.Tensor]:
        if task_key == "pos":
            return token_collator(features)
        else:
            return default_collator(features)

    return inner
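A short sketch of how the returned closure might be wired into DataLoaders in a multi-task setup where the task key is fixed per loader; `tokenizer`, `pos_dataset`, and `nli_dataset` are placeholders, and the `functools.partial` binding is an assumption for illustration:

from functools import partial

from torch.utils.data import DataLoader

collator = TaskDependentCollator(tokenizer)

# POS batches go through the token-classification collator,
# every other task falls back to DataCollatorWithPadding
pos_loader = DataLoader(pos_dataset, batch_size=16,
                        collate_fn=partial(collator, "pos"))
nli_loader = DataLoader(nli_dataset, batch_size=16,
                        collate_fn=partial(collator, "nli"))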
Example 4
from datasets import concatenate_datasets
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


def create_dataloader(args,
                      dataset,
                      tokenizer,
                      k_shot=False,
                      num_classes=None):
    """
    Function to create a PyTorch Dataloader from a given dataset.
    Inputs:
        args - Namespace object from the argument parser
        dataset - Dataset to convert to Dataloader
        tokenizer - BERT tokenizer instance
        k_shot - Indicates whether to make the training set k-shot. Default is False
        num_classes - Number of classes in the dataset. Default is None
    Outputs:
        dataset - DataLoader object of the dataset
    """

    # check if k-shot
    new_dataset = []
    if k_shot:
        for current_class in range(0, num_classes):
            class_set = dataset.filter(
                lambda example: example['labels'] == current_class)
            class_set = class_set.shuffle()
            class_set = class_set.filter(lambda e, i: i < args.k,
                                         with_indices=True)
            new_dataset.append(class_set)
        dataset = concatenate_datasets(new_dataset)

    # create a data collator function
    data_collator = DataCollatorWithPadding(tokenizer)

    # create the dataloader
    dataset = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        drop_last=False,
        shuffle=True,
    )

    # return the dataset
    return dataset
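A hedged sketch of the k-shot path, assuming an `args` namespace that carries both `batch_size` and `k`, a binary classification dataset with a `labels` column, and a `tokenized_train`/`tokenizer` pair prepared as in Example 1 (all placeholder names):

from argparse import Namespace

args = Namespace(batch_size=16, k=8)
# keep only args.k shuffled examples per class before building the loader
few_shot_loader = create_dataloader(args,
                                    tokenized_train,
                                    tokenizer,
                                    k_shot=True,
                                    num_classes=2)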
Example 5
    def _compute_thresholds(self):
        confidences = []
        with torch.no_grad():
            loader = data.DataLoader(
                self.val_dataset,
                batch_size=64,
                collate_fn=DataCollatorWithPadding(self.tokenizer),
            )
            for batch in loader:
                labels = batch.pop("labels")

                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                preds = self.model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   return_dict=True).logits
                output = F.softmax(preds, dim=1).cpu()
                max_value, max_index = torch.max(output, dim=1)
                confidences += max_value[max_index == labels].tolist()

        self.threshold_min = 0.7  # torch.tensor(confidences).mean().item()
        self.threshold_max = 1.0  # torch.tensor(confidences).max().item()
Example 6

    def collate_batch(
            self, features: List[Union[InputDataClass,
                                       Dict]]) -> Dict[str, torch.Tensor]:
        first = features[0]
        batch = {}
        if isinstance(first, dict):
            # NLP datasets currently present features as lists of dictionaries
            # (one per example), so we adapt the collate_batch logic for that
            if "labels" in first and first["labels"] is not None:
                if first["labels"].dtype == torch.int64:
                    labels = torch.tensor([f["labels"] for f in features],
                                          dtype=torch.long)
                else:
                    labels = torch.tensor([f["labels"] for f in features],
                                          dtype=torch.float)
                batch = {"labels": labels}
            for k, v in first.items():
                if k != "labels" and v is not None and not isinstance(v, str):
                    batch[k] = torch.stack([f[k] for f in features])
            return batch
        else:
            # otherwise, revert to the padding collator; DataCollatorWithPadding
            # needs a tokenizer, so this assumes the surrounding class exposes
            # one as self.tokenizer
            return DataCollatorWithPadding(self.tokenizer)(features)
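For illustration, a small worked example of what this method returns when handed a list of already-padded feature dicts; `trainer` stands in for an instance of the surrounding class and the token ids are made up:

features = [
    {"input_ids": torch.tensor([101, 7592, 102, 0]),
     "attention_mask": torch.tensor([1, 1, 1, 0]),
     "labels": torch.tensor(1)},
    {"input_ids": torch.tensor([101, 2088, 2003, 102]),
     "attention_mask": torch.tensor([1, 1, 1, 1]),
     "labels": torch.tensor(0)},
]
batch = trainer.collate_batch(features)
# batch["input_ids"].shape -> torch.Size([2, 4])
# batch["labels"]          -> tensor([1, 0])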
Example 7
dataset = load_dataset(DATASETS_CLASS_PATH,
                       data_files=ANNOTATIONS_PATH,
                       name='pickle',
                       split='train',
                       cache_dir=CACHE_DIR)

encoded_dataset = dataset.map(
    lambda examples: encode_examples(examples['context_left'], examples[
        'context_center'], examples['context_right'], tokenizer, MAX_SEQ_LEN),
    batched=True,
    remove_columns=[
        'context_center', 'context_left', 'context_right', 'document_id', 'id'
    ])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                        max_length=MAX_SEQ_LEN)

dataset_loader = DataLoader(encoded_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            collate_fn=data_collator)

meta_cat = MetaCAT(save_dir=META_CAT_SAVE_DIR, tokenizer=tokenizer)
meta_cat.load()
meta_cat.model.eval()
meta_cat.model.to(DEVICE)

logits = []
for i, batch in enumerate(dataset_loader):
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    out = meta_cat.model(**batch)
Example 8

    def collate_fn(self, data):
        # delegate dynamic padding of the sampled batch to the tokenizer
        return DataCollatorWithPadding(tokenizer=self.tokenizer)(data)
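A minimal sketch of how a bound collate_fn like this is typically attached to a DataLoader, assuming the surrounding class also holds a tokenized `self.dataset` (the `get_dataloader` method below is hypothetical):

    def get_dataloader(self, batch_size=32):
        # the collator pads each sampled batch dynamically with the tokenizer
        return DataLoader(self.dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=self.collate_fn)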