Example #1
 def setUp(self):
     d = mock_dict()
     tokens_1 = torch.LongTensor([i for i in range(1, 5000, 2)]).view(1, -1)
     tokens_ds1 = TokenBlockDataset(
         tokens_1,
         sizes=[tokens_1.size(-1)],
         block_size=1,
         pad=0,
         eos=1,
         include_targets=False,
     )
     self.dataset_1 = LanguagePairDataset(tokens_ds1,
                                          tokens_ds1.sizes,
                                          d,
                                          shuffle=False)
     tokens_2 = torch.LongTensor([i for i in range(0, 5000, 2)]).view(1, -1)
     tokens_ds2 = TokenBlockDataset(
         tokens_2,
         sizes=[tokens_2.size(-1)],
         block_size=1,
         pad=0,
         eos=1,
         include_targets=False,
     )
     self.dataset_2 = LanguagePairDataset(tokens_ds2,
                                          tokens_ds2.sizes,
                                          d,
                                          shuffle=False)
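A minimal standalone sketch of what this blocking does, assuming fairseq and torch are importable (toy ids, not values from the original test):

import torch
from fairseq.data import TokenBlockDataset

toy = torch.LongTensor(list(range(2, 10))).view(1, -1)  # one "sentence" of 8 tokens
blocks = TokenBlockDataset(
    toy,
    sizes=[toy.size(-1)],
    block_size=4,            # with break_mode="none", each item is a 4-token slice
    pad=0,
    eos=1,
    break_mode="none",
    include_targets=False,
)
print(len(blocks))  # expected: 2
print(blocks[0])    # expected: tensor([2, 3, 4, 5])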
Example #2
    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        assert not self.cfg.include_src or len(src_tokens[0]) == 2
        input_src = None
        if self.cfg.include_src:
            input_src = TokenBlockDataset(
                [t[0] for t in src_tokens],
                [l[0] for l in src_lengths],
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            )
            input_src = PrependTokenDataset(input_src, self.dictionary.bos())
            input_src = TruncateDataset(input_src, self.cfg.max_positions)

        input_tgt = TokenBlockDataset(
            [t[-1] for t in src_tokens],
            [l[-1] for l in src_lengths],
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode="eos",
        )
        input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
        if self.cfg.include_src:
            src_tokens = ConcatSentencesDataset(input_src, input_tgt)
            src_lengths = NumelDataset(input_src, reduce=False)
        else:
            input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos())
            src_tokens = input_tgt
            src_lengths = NumelDataset(src_tokens, reduce=False)

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        return NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )
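Since block_size is ignored for the "eos" break mode, each block is simply one eos-terminated sentence. A minimal sketch with toy ids (1 as eos, 0 as pad; not taken from the original task):

import torch
from fairseq.data import TokenBlockDataset

sentences = [torch.LongTensor([4, 5, 6, 1]), torch.LongTensor([7, 8, 1])]
blocks = TokenBlockDataset(
    sentences,
    [4, 3],
    block_size=None,  # ignored for "eos" break mode
    pad=0,
    eos=1,
    break_mode="eos",
)
print(len(blocks))  # expected: 2, one block per sentence
print(blocks[1])    # expected: tensor([7, 8, 1])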
Example #3
def get_prepended_token_block_dataset(args,
                                      dataset_path,
                                      vocab,
                                      combine=False):
    dataset = data_utils.load_indexed_dataset(
        dataset_path,
        vocab,
        args.dataset_impl,
        combine=combine,
    )

    if dataset is None:
        raise FileNotFoundError('Dataset not found: ({})'.format(dataset_path))

    if not args.apply_ptb:
        print("| [I] ptb not applied.", flush=True)
        return dataset

    dataset = TruncateDataset(dataset, args.tokens_per_sample - 1)
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        args.tokens_per_sample - 1,  # one less for <s>
        pad=vocab.pad(),
        eos=vocab.eos(),
        break_mode=args.sample_break_mode,
    )
    print('| loaded {} blocks from: {}'.format(len(dataset), dataset_path),
          flush=True)

    dataset = PrependTokenDataset(dataset, vocab.bos())
    return dataset
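A hypothetical invocation of the helper above; the flag values, data path, and dictionary file below are placeholders for illustration, not values from the original project:

from argparse import Namespace
from fairseq.data import Dictionary

args = Namespace(
    dataset_impl="mmap",           # placeholder: how the data was binarized
    apply_ptb=True,                # apply the prepended-token blocking
    tokens_per_sample=512,
    sample_break_mode="complete",
)
vocab = Dictionary.load("dict.txt")  # placeholder dictionary path
train = get_prepended_token_block_dataset(
    args, "data-bin/example/train", vocab, combine=False)  # placeholder data path
print(len(train), train[0][:5])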
Example #4
def main(args):
    tokenizer = build_tokenizer(args)

    indices = []
    with open(args.input) as fp:
        for line in tqdm(fp):
            line = line.strip()
            indices.append(tokenizer.encode(line))
    print("tokenize finished.")
    for i in range(5):
        print("example[%d]:" % i)
        input_ids = indices[i]
        print(input_ids)
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(tokens)

    dataset = IndexDataset(indices)
    dataset = TruncateDataset(dataset, args.tokens_per_sample - 1)
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        args.tokens_per_sample - 1,  # one less for <s>
        pad=tokenizer.pad_token_id,
        eos=tokenizer.sep_token_id,
        break_mode=args.sample_break_mode,
    )
    print('| loaded {} blocks from: {}'.format(len(dataset), args.input),
          flush=True)

    dataset = PrependTokenDataset(dataset, tokenizer.cls_token_id)
    print("| get all items ...")
    items = [i for i in tqdm(dataset)]
    print("| writing binary file ...")
    prefix = os.path.join(args.output, "train.0")
    save_items(items, prefix, len(tokenizer))
Example #5
 def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
     src_dataset = PadDataset(
         TokenBlockDataset(
             src_tokens,
             src_lengths,
             self.args.tokens_per_sample - 1,  # one less for <s>
             pad=self.source_dictionary.pad(),
             eos=self.source_dictionary.eos(),
             break_mode='eos',
         ),
         pad_idx=self.source_dictionary.pad(),
         left_pad=False,
     )
     src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos())
     src_dataset = NestedDictionaryDataset(
         {
             'id': IdDataset(),
             'net_input': {
                 'src_tokens': src_dataset,
                 'src_lengths': NumelDataset(src_dataset, reduce=False),
             },
         },
         sizes=src_lengths,
     )
     if sort:
         src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
     return src_dataset
Example #6
 def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
     src_dataset = RightPadDataset(
         TokenBlockDataset(
             src_tokens,
             src_lengths,
             self.args.tokens_per_sample - 1,  # one less for <s>
             pad=self.source_dictionary.pad(),
             eos=self.source_dictionary.eos(),
             break_mode="eos",
         ),
         pad_idx=self.source_dictionary.pad(),
     )
     src_dataset = PrependTokenDataset(src_dataset,
                                       self.source_dictionary.bos())
     src_dataset = NestedDictionaryDataset(
         {
             "id": IdDataset(),
             "net_input": {
                 "src_tokens": src_dataset,
                 "src_lengths": NumelDataset(src_dataset, reduce=False),
             },
         },
         sizes=src_lengths,
     )
     if sort:
         src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
     return src_dataset
Example #7
 def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
     """
     Generate batches for inference. We assume that the input begins with a
     bos symbol (`<s>`) and ends with an eos symbol (`</s>`).
     """
     pad = self.source_dictionary.pad()
     eos = self.source_dictionary.eos()
     src_dataset = TokenBlockDataset(
         src_tokens,
         src_lengths,
         block_size=self.args.tokens_per_sample - 2,  # for <s> and </s>
         pad=pad,
         eos=eos,
         break_mode=self.args.sample_break_mode,
         document_sep_len=0,
     )
     prev_output_tokens = PrependTokenDataset(
         StripTokenDataset(src_dataset, eos), eos)
     src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False)
     return NestedDictionaryDataset(
         {
             "id": IdDataset(),
             "net_input": {
                 "src_tokens":
                 src_dataset,
                 "src_lengths":
                 NumelDataset(src_dataset, reduce=False),
                 "prev_output_tokens":
                 PadDataset(prev_output_tokens, pad_idx=pad,
                            left_pad=False),
             },
             "target": src_dataset,
         },
         sizes=[np.array(src_lengths)],
     )
Example #8
    def build_dataset_for_inference(self, src_tokens: List[torch.Tensor],
                                    src_lengths: List[int],
                                    **kwargs) -> torch.utils.data.Dataset:
        eos = self.source_dictionary.eos()
        dataset = TokenBlockDataset(
            src_tokens,
            src_lengths,
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=eos,
            break_mode="eos",
        )

        class Dataset(torch.utils.data.Dataset):
            def __getitem__(self, i):
                item = dataset[i]
                if item[-1] == eos:
                    # remove eos to support generating with a prefix
                    item = item[:-1]
                return (i, [item])

            def __len__(self):
                return len(dataset)

        return Dataset()
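Usage sketch for the wrapper above: each item is a tuple of the sample index and a single-element list holding the eos-stripped prefix tensor. Here `task` is a hypothetical instance of the enclosing class:

import torch

# hypothetical: `task` is an instance of the class defining the method above
eos = task.source_dictionary.eos()
ds = task.build_dataset_for_inference(
    src_tokens=[torch.LongTensor([4, 5, eos])],
    src_lengths=[3],
)
idx, (prefix,) = ds[0]
print(idx, prefix)  # expected: 0 tensor([4, 5]) -- trailing eos removed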
Example #9
 def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
     src_dataset = RightPadDataset(
         TokenBlockDataset(
             src_tokens,
             src_lengths,
             self.args.tokens_per_sample,
             pad=self.source_dictionary.pad(),
             eos=self.source_dictionary.eos(),
             break_mode='eos',
         ),
         pad_idx=self.source_dictionary.pad(),
     )
     # remove tail
     src_dataset = RemoveTailDataset(src_dataset)
     src_dataset = NestedDictionaryDataset(
         {
             'id': IdDataset(),
             'net_input': {
                 'src_tokens': src_dataset,
                 'src_lengths': NumelDataset(src_dataset, reduce=False),
             },
         },
         sizes=src_lengths,
     )
     if sort:
         src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
     return src_dataset
Example #10
    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)"""

        # support sharded datasets
        paths = utils.split_paths(self.cfg.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each element of *data* will be a tensorized line from the original
        # text dataset, similar to ``open(split_path).readlines()``
        data = data_utils.load_indexed_dataset(split_path,
                                               self.dictionary,
                                               combine=combine)
        if data is None:
            raise FileNotFoundError("Dataset not found: {} ({})".format(
                split, split_path))

        # this is similar to ``data.view(-1).split(tokens_per_sample)``
        data = TokenBlockDataset(
            data,
            data.sizes,
            block_size=self.cfg.tokens_per_sample,
            pad=None,  # unused
            eos=None,  # unused
            break_mode="none",
        )

        self.datasets[split] = TruncatedBPTTDataset(
            data=data,
            bsz_per_shard=self.cfg.batch_size,
            shard_id=self.cfg.data_parallel_rank,
            num_shards=self.cfg.data_parallel_size,
        )
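The comment above about data.view(-1).split(tokens_per_sample) can be checked with plain torch, independent of fairseq (toy values):

import torch

flat = torch.arange(10)
print(list(flat.split(4)))
# expected: [tensor([0, 1, 2, 3]), tensor([4, 5, 6, 7]), tensor([8, 9])]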
Example #11
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = self.args.data.split(':')
        assert len(paths) > 0
        data_path = paths[epoch % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))

        dataset = TokenBlockDataset(
            dataset, dataset.sizes, self.args.tokens_per_sample,
            pad=self.dictionary.pad(), eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode, include_targets=True,
        )

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset, dataset.sizes, self.dictionary, self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True,
            targets=self.targets, add_bos_token=self.args.add_bos_token,
        )
Example #12
def load_mask_data(path, mydict):
    # Returns one big list; each item is a document matrix, and each entry in
    # the matrix is a node's value (for token_id and ...).
    #print('???',path)
    #from fairseq.data.indexed_dataset import MMapIndexedDataset
    #print('???', MMapIndexedDataset(path) )
    dataset = data_utils.load_indexed_dataset(
        path,
        mydict,
        'mmap',
        combine=False,
    )
    #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset))
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        512 - 1,
        pad=mydict.pad(),
        eos=mydict.eos(),
        break_mode='complete_doc',
    )
    #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset))
    dataset = PrependTokenDataset(dataset, mydict.bos())
    #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset))

    return dataset
Example #13
    def load_sentence(self, split, sentence):
        loaded_datasets = []
        words = sentence.split(' ')
        ds = IndexedRawTextDataset(words, self.dictionary)
        loaded_datasets.append(
            TokenBlockDataset(
                ds,
                ds.sizes,
                self.args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.args.sample_break_mode,
                include_targets=True,
            ))
        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
        )
Example #14
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []
        loaded_labels = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedDataset(path, fix_lua_indexing=False)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    0,
                    pad=self.dictionary.pad(),
                    break_mode='eos',
                    include_targets=False,
                ))

            with open(path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                loaded_labels.extend(int(l) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = SentenceClassificationDataset(
            dataset,
            loaded_labels,
            sizes,
            self.dictionary,
        )
Example #15
    def load_dataset(self,
                     split: str,
                     epoch=1,
                     combine=False,
                     **kwargs) -> MonolingualDataset:
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0

        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each process has its own copy of the raw data (likely to be an np.memmap)
        dataset = data_utils.load_indexed_dataset(split_path,
                                                  self.dictionary,
                                                  self.args.dataset_impl,
                                                  combine=combine)
        if dataset is None:
            raise FileNotFoundError(
                f"Dataset not found: {split} ({split_path})")

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.tokens_per_sample,
            self.args.seed,
        )
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
            use_plasma_view=self.args.use_plasma_view,
            split_path=split_path,
            plasma_path=self.args.plasma_path,
        )

        add_eos_for_other_targets = (self.args.sample_break_mode is not None
                                     and self.args.sample_break_mode != "none")

        self.datasets[split] = MonolingualDataset(
            dataset=dataset,
            sizes=dataset.sizes,
            src_vocab=self.dictionary,
            tgt_vocab=self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.args.add_bos_token,
        )
Example #16
    def build_dataset_for_inference(self,
                                    src_tokens,
                                    src_lengths,
                                    language="en_XX",
                                    **kwargs):
        """
        Generate batches for inference. We prepend an eos token to src_tokens
        (or bos if `--add-bos-token` is set) and we append a <pad> to target.
        This is convenient both for generation with a prefix and LM scoring.
        """
        dataset = StripTokenDataset(
            TokenBlockDataset(
                src_tokens,
                src_lengths,
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            ),
            # remove eos from (end of) target sequence
            self.source_dictionary.eos(),
        )

        src_lang_idx = self.dictionary.index(lang_token(language))
        src_dataset = PrependTokenDataset(
            dataset,
            token=((src_lang_idx or self.source_dictionary.bos()) if getattr(
                self.args, "add_bos_token", False) else
                   self.source_dictionary.eos()),
        )

        max_seq_len = max(src_lengths) + 1
        tgt_dataset = AppendTokenDataset(dataset,
                                         token=self.source_dictionary.pad())
        return NestedDictionaryDataset(
            {
                "id":
                IdDataset(),
                "net_input": {
                    "src_tokens":
                    PadDataset(
                        src_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                        pad_length=max_seq_len,
                    ),
                    "src_lengths":
                    NumelDataset(src_dataset, reduce=False),
                },
                "target":
                PadDataset(
                    tgt_dataset,
                    pad_idx=self.source_dictionary.pad(),
                    left_pad=False,
                    pad_length=max_seq_len,
                ),
            },
            sizes=[np.array(src_lengths)],
        )
Example #17
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        use_ctx_dataset = getattr(self.vqvae_args, 'use_context_dataset', 0)
        paths = self.vqvae_args.data.split(":")
        assert len(paths) > 0

        data_path = paths[epoch % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path, self.dictionary, self.vqvae_args.dataset_impl, combine=combine
        )
        if dataset is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        if use_ctx_dataset:
            dataset = DocBlockDataset(
                dataset,
                dataset.sizes,
                self.vqvae_args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.vqvae_args.sample_break_mode,
                include_targets=True,
                context_mode=self.vqvae_args.context_mode,
                window_size=self.vqvae_args.window_size,
            )
        else:
            dataset = TokenBlockDataset(
                dataset,
                dataset.sizes,
                self.vqvae_args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.vqvae_args.sample_break_mode,
                include_targets=True,
            )

        add_eos_for_other_targets = (
                self.vqvae_args.sample_break_mode is not None
                and self.vqvae_args.sample_break_mode != "none"
        )

        self.datasets[split] = MonolingualDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.vqvae_args.add_bos_token,
        )
Example #18
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                ds = IndexedDataset(path, fix_lua_indexing=True)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    self.args.tokens_per_sample,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                    break_mode=self.args.sample_break_mode,
                    include_targets=True,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
        )
Example #19
    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError("Dataset not found: {} ({})".format(
                split, split_path))

        dataset = StripTokenDataset(dataset, self.dictionary.eos())

        # create continuous blocks of tokens
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample - 2,  # one less for <s> and one for </s>
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            document_sep_len=0,
        )

        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())

        mask_whole_words = (get_whole_word_mask(self.args,
                                                self.source_dictionary)
                            if self.args.mask_length != "subword" else None)

        self.datasets[split] = DenoisingDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.mask_idx,
            mask_whole_words,
            shuffle=self.args.shuffle_instance,
            seed=self.seed,
            args=self.args,
        )
        logger.info(
            "Split: {0}, Loaded {1} samples of denoising_dataset".format(
                split,
                len(self.datasets[split]),
            ))
Example #20
    def load_dataset(self, split, combine=False):
        """Load a dataset split."""

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            cbt_booktitle_idx = None
            if self.args.sample_break_mode == 'cbt_booktitle':
                if self.dictionary.index(
                        '_BOOK_TITLE_') != self.dictionary.unk():
                    cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_')

            loaded_datasets.append(
                TokenBlockDataset(
                    tokens,
                    ds.sizes,
                    self.args.tokens_per_sample,
                    self.args.sample_break_mode,
                    include_targets=True,
                    cbt_booktitle_idx=cbt_booktitle_idx,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = MonolingualDataset(dataset,
                                                  sizes,
                                                  self.dictionary,
                                                  shuffle=False)
Example #21
    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0

        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path, self.dictionary, self.args.dataset_impl, combine=combine
        )
        if dataset is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.tokens_per_sample,
            self.args.seed,
        )

        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        add_eos_for_other_targets = (
            self.args.sample_break_mode is not None
            and self.args.sample_break_mode != "none"
        )

        self.datasets[split] = self._initialize_dataset(
            dataset=dataset,
            sizes=dataset.sizes,
            src_vocab=self.dictionary,
            tgt_vocab=self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.args.add_bos_token,
        )
Example #22
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        dataset_map = OrderedDict()

        for lang in self.langs2id.keys():
            if self.default_key is None:
                self.default_key = lang
            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = '{}.{}'.format(split, lang)
            path = os.path.join(self.args.data, language_split)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(
                    language_split, self.args.data))

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            block_dataset = TokenBlockDataset(
                dataset=ds,
                sizes=ds.sizes,
                block_size=self.args.tokens_per_sample - 1,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos())

            dataset_map[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=block_dataset.sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, 'shuffle', False),
                has_pairs=False,
                segment_id=self.langs2id[lang],
                seed=self.seed,
            )

        self.datasets[split] = MultiCorpusSampledDataset(
            dataset_map, default_key=self.default_key)
        print('| {} {} {} examples'.format(self.args.data, split,
                                           len(self.datasets[split])))
Example #23
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[epoch % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(
                split, split_path))

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.tokens_per_sample,
            self.args.seed,
        )

        # create continuous blocks of tokens (block_size is 511 or 512)
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode=self.args.sample_break_mode,
        )
        logger.info('loaded {} blocks from: {}'.format(len(dataset),
                                                       split_path))
        s2s_dataset = MaskedLanguagePairDataset.apply_mask(
            dataset,
            dataset.sizes,
            self.source_dictionary,
            shuffle=True,
            mask_prob=self.args.mask_prob,
            leave_unmasked_prob=self.args.leave_unmasked_prob,
            random_token_prob=self.args.random_token_prob,
        )
        self.datasets[split] = s2s_dataset
Example #24
    def _load_single_lang_dataset(self, split, epoch):
        loaded_datasets = []

        paths = self.args.data.split(":")
        assert len(paths) > 0
        data_path = paths[epoch % len(paths)]

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else "")
            path = os.path.join(data_path, split_k)

            ds = data_utils.load_indexed_dataset(
                path, self.dictionary, self.args.dataset_impl
            )
            if ds is None:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        "Dataset not found: {} ({})".format(split, data_path)
                    )

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    ds.sizes,
                    self.args.tokens_per_sample - 1,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                )
            )

            print(
                "| {} {} {} examples".format(
                    data_path, split_k, len(loaded_datasets[-1])
                )
            )

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        return dataset, sizes
Example #25
    def load_dataset_ordering(self, input_ordered_file, input_shuffled_file):
        """Load the shuffled sentence-ordering data as the 'test' split.

        Args:
            input_ordered_file (str): path to the ordered reference file (unused here)
            input_shuffled_file (str): path to the shuffled input file
        """

        loaded_datasets = []

        assert self.args.raw_text and IndexedRawTextDataset.exists(
            input_shuffled_file)
        ds = IndexedRawTextDataset(input_shuffled_file, self.dictionary)
        tokens = [t for l in ds.tokens_list for t in l]

        loaded_datasets.append(
            TokenBlockDataset(
                tokens,
                ds.sizes,
                self.args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.args.sample_break_mode,
                include_targets=True,
            ))

        print('| {} {} examples'.format(input_shuffled_file,
                                        len(loaded_datasets[-1])))

        # if not combine:
        #     break

        assert len(loaded_datasets) == 1
        dataset = loaded_datasets[0]
        sizes = dataset.sizes

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets['test'] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False,
            targets=self.targets,
        )
Example #26
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        print("This is the split", split)

        from fairseq.data.cvit.utils import monoling_select
        dataset = monoling_select(self.data['corpora'], split)

        from ilmulti.sentencepiece import SentencePieceTokenizer

        hard_code_dict = self.data['hard_coded_dict']

        tokenizer = SentencePieceTokenizer(hard_code_dict)
        dataset = CVITIndexedRawTextDataset(dataset, tokenizer,
                                            self.dictionary)

        if dataset is None:
            raise FileNotFoundError('Dataset not found: {}'.format(split))

        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.args.add_bos_token,
        )
Example #27
    def load_dataset(self, split):
        """Load a dataset split."""
        path = os.path.join(self.args.data, split)
        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            tokens = ds.tokens_list
        elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer
        else:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

        dataset = TokenBlockDataset(
            tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode,
            include_targets=True,  # return next tokens as targets
        )
        self.datasets[split] = MonolingualDataset(dataset, dataset.sizes, self.dictionary, shuffle=False)
Example #28
    def _load_dataset_split(self, split, epoch, combine):
        paths = utils.split_paths(self.cfg.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.cfg.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        dataset = StripTokenDataset(dataset, self.dictionary.eos())

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.cfg.shorten_data_split_list,
            self.cfg.shorten_method,
            self.cfg.tokens_per_sample,
            self.cfg.seed,
        )

        # create continuous blocks of tokens
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.cfg.tokens_per_sample - 2,  # one less for <s> and one for </s>
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.cfg.sample_break_mode,
            document_sep_len=0,
        )
        logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))

        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())
        return dataset
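The net effect of the last two wrappers is a bos prefix and an eos suffix on every block. A minimal sketch over a toy base dataset, assuming ListDataset is available in this fairseq version (0 and 2 stand in for bos and eos):

import torch
from fairseq.data import AppendTokenDataset, ListDataset, PrependTokenDataset

base = ListDataset([torch.LongTensor([10, 11, 12])], sizes=[3])
wrapped = AppendTokenDataset(PrependTokenDataset(base, token=0), token=2)
print(wrapped[0])  # expected: tensor([ 0, 10, 11, 12,  2])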
Example #29
    def _load_single_lang_dataset(self, split):
        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    ds.sizes,
                    self.args.tokens_per_sample - 1,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        return dataset, sizes
Example #30
    def build_s2s_dataset(self, dataset):
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode=self.args.sample_break_mode,
        )
        
        pred_probs = torch.FloatTensor([float(x) for x in self.args.mask_s2s_mask_keep_rand.split(',')])

        s2s_dataset = MaskedLanguagePairDataset(
            dataset, dataset.sizes, self.source_dictionary,
            shuffle=True, mask_prob=self.args.mask_s2s_prob,
            pred_probs=pred_probs,
        )
        return s2s_dataset