from typing import Any, Dict

# Sample and SampleList come from MMF (mmf.common.sample).
from mmf.common.sample import Sample, SampleList


def __call__(self, item):
    texts = item["text"]
    # Accept a single string as well as a list of sentences.
    if not isinstance(texts, list):
        texts = [texts]

    processed = []
    for idx, text in enumerate(texts):
        sample = Sample()
        # Tokenize each sentence separately via the parent processor.
        processed_text = super().__call__({"text": text})
        sample.update(processed_text)
        # Tag every token of this sentence with its sentence index.
        sample.segment_ids.fill_(idx)
        processed.append(sample)
    # Use SampleList to convert list of tensors to stacked tensors
    processed = SampleList(processed)
    # Flatten the stacked [num_sentences, seq_len] tensors into one
    # concatenated sequence per field.
    processed.input_ids = processed.input_ids.view(-1)
    processed.input_mask = processed.input_mask.view(-1)
    processed.segment_ids = processed.segment_ids.view(-1)
    return processed.to_dict()
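To make the flattening step concrete, here is a minimal sketch in plain PyTorch, independent of the processor above; the sentence count and padded length are assumed values. It shows what stacking the per-sentence tensors (as SampleList does) and then calling view(-1) produces.

import torch

# Illustrative sizes: 3 sentences, each padded to 20 tokens.
per_sentence = [torch.zeros(20, dtype=torch.long) for _ in range(3)]

# SampleList stacks per-sample tensors along a new first dimension,
# giving shape [num_sentences, seq_len] = [3, 20]; torch.stack mimics that.
stacked = torch.stack(per_sentence)

# view(-1) lays the sentences out back to back as one flat sequence of
# length num_sentences * seq_len = 60 -- the "concat" fusion result.
flat = stacked.view(-1)
print(stacked.shape, flat.shape)  # torch.Size([3, 20]) torch.Size([60])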
# Variant that wraps a tokenizer instance (composition instead of
# inheritance) and only flattens when the configured fusion strategy
# is "concat"; it also carries lm_label_ids through.
def __call__(self, item: Dict[str, Any]):
    texts = item["text"]
    # Accept a single string as well as a list of sentences.
    if not isinstance(texts, list):
        texts = [texts]

    processed = []
    for idx, text in enumerate(texts):
        sample = Sample()
        # Tokenize each sentence separately with the wrapped tokenizer.
        processed_text = self.tokenizer({"text": text})
        sample.update(processed_text)
        # Tag every token of this sentence with its sentence index.
        sample.segment_ids.fill_(idx)
        processed.append(sample)
    # Use SampleList to convert list of tensors to stacked tensors
    processed = SampleList(processed)
    if self.fusion_strategy == "concat":
        # Flatten each field into one concatenated sequence; any other
        # strategy keeps the per-sentence [num_sentences, seq_len] layout.
        processed.input_ids = processed.input_ids.view(-1)
        processed.input_mask = processed.input_mask.view(-1)
        processed.segment_ids = processed.segment_ids.view(-1)
        processed.lm_label_ids = processed.lm_label_ids.view(-1)
    return processed.to_dict()
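A hedged usage sketch follows. It assumes the method above belongs to MMF's MultiSentenceBertTokenizer processor and that the config follows MMF's BertTokenizer schema; the import path, class name, config keys, and sample texts are illustrative assumptions, not confirmed API, so check your dataset config before copying.

# Hypothetical usage; class name, import path, and config keys are
# assumptions modeled on MMF's processor configs.
from omegaconf import OmegaConf

from mmf.datasets.processors.bert_processors import MultiSentenceBertTokenizer

config = OmegaConf.create(
    {
        "tokenizer_config": {
            "type": "bert-base-uncased",
            "params": {"do_lower_case": True},
        },
        "mask_probability": 0,
        "max_seq_length": 20,
        "fusion": "concat",
    }
)

processor = MultiSentenceBertTokenizer(config)
out = processor({"text": ["The cat sat on the mat.", "It was sunny."]})

# With 2 sentences padded to 20 tokens and fusion == "concat", each field
# is a flat tensor of length 2 * 20 = 40; segment_ids separates sentences.
print(out["input_ids"].shape)       # torch.Size([40])
print(out["segment_ids"].unique())  # tensor([0, 1])

Wrapping the tokenizer rather than inheriting from it keeps the multi-sentence logic independent of any one tokenizer class and lets the fusion behavior be chosen from config.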