def collate(data, tokenizer, block_size):
    """Build the batch tensors from a list of (story, summary) pairs.

    Pairs in which the story or the summary has length zero are skipped.
    Every remaining pair goes through ``encode_for_summarization``; each of
    the two resulting sequences is then handed to ``fit_to_block_size``
    together with ``block_size`` and the tokenizer's pad token id, and the
    results are stacked with ``torch.tensor``.

    Args:
        data: iterable of two-element items ``(story, summary)``.
        tokenizer: object exposing ``pad_token_id`` and ``cls_token_id``;
            also forwarded to ``encode_for_summarization``.
        block_size: forwarded to ``fit_to_block_size`` (presumably the
            fixed sequence length -- confirm against that helper).

    Returns:
        The 6-tuple ``(stories, summaries, encoder_token_type_ids,
        encoder_mask, decoder_mask, lm_labels)``.
    """
    # Lazily drop the examples with an empty story or an empty summary.
    non_empty = (
        pair for pair in data if len(pair[0]) != 0 and len(pair[1]) != 0
    )
    encoded_pairs = [
        encode_for_summarization(story, summary, tokenizer)
        for story, summary in non_empty
    ]

    # Fit every encoded sequence to the block size, story first, then summary.
    story_rows = []
    summary_rows = []
    for story_ids, summary_ids in encoded_pairs:
        story_rows.append(
            fit_to_block_size(story_ids, block_size, tokenizer.pad_token_id)
        )
        summary_rows.append(
            fit_to_block_size(summary_ids, block_size, tokenizer.pad_token_id)
        )

    stories = torch.tensor(story_rows)
    summaries = torch.tensor(summary_rows)

    # Auxiliary tensors derived from the stacked stories and summaries.
    token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
    story_mask = build_mask(stories, tokenizer.pad_token_id)
    summary_mask = build_mask(summaries, tokenizer.pad_token_id)
    labels = build_lm_labels(summaries, tokenizer.pad_token_id)

    return (
        stories,
        summaries,
        token_type_ids,
        story_mask,
        summary_mask,
        labels,
    )
def collate(data, tokenizer, block_size, device):
    """Format the examples handed over by the data loader into one ``Batch``.

    Tokenization happens here, batch after batch, so that the encoded data
    never has to be kept in memory all at once. The result is a ``Batch``
    (expected to be a namedtuple defined elsewhere) so that it fits the
    original BertAbs API.

    Args:
        data: iterable of three-element items ``(name, story, summary)``;
            each summary is joined with single spaces to form the target
            string.
        tokenizer: object exposing ``pad_token_id`` and ``cls_token_id``;
            also forwarded to ``encode_for_summarization``.
        block_size: forwarded to ``truncate_or_pad`` (presumably the fixed
            sequence length -- confirm against that helper).
        device: target passed to ``.to(...)`` for the three tensors.

    Returns:
        A ``Batch`` with ``document_names``, ``batch_size``, ``src``,
        ``segs``, ``mask_src`` and ``tgt_str`` filled in.
    """
    # Drop the examples whose second element (the story) is empty.
    examples = [example for example in data if len(example[1]) != 0]

    document_names = [doc_name for doc_name, _, _ in examples]

    target_strings = []
    for _, _, summary_parts in examples:
        target_strings.append(" ".join(summary_parts))

    encoded_pairs = []
    for _, story, summary in examples:
        encoded_pairs.append(encode_for_summarization(story, summary, tokenizer))

    # Only the encoded stories are turned into a tensor; the encoded
    # summaries are discarded.
    story_rows = [
        truncate_or_pad(story_ids, block_size, tokenizer.pad_token_id)
        for story_ids, _ in encoded_pairs
    ]
    story_tensor = torch.tensor(story_rows)

    segment_ids = compute_token_type_ids(story_tensor, tokenizer.cls_token_id)
    source_mask = build_mask(story_tensor, tokenizer.pad_token_id)

    return Batch(
        document_names=document_names,
        batch_size=len(story_tensor),
        src=story_tensor.to(device),
        segs=segment_ids.to(device),
        mask_src=source_mask.to(device),
        tgt_str=target_strings,
    )