def split_exists(split, data_type, data_path):
    filename = os.path.join(data_path, f'{split}.{data_type}')
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedDataset.exists(filename):
        return True
    return False
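These split_exists helpers are closures defined inside a task's load_dataset, which is why they reference self.args without taking self as a parameter. A minimal sketch of how such a probe might drive shard discovery, assuming the helper above is passed in as a callable; find_shards is illustrative and not taken from the source:

import itertools
import os

def find_shards(split, data_type, data_path, split_exists):
    # Collect shards train, train1, train2, ... until the probe fails,
    # mirroring the itertools.count() loops in the load_dataset examples below.
    shards = []
    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        if not split_exists(split_k, data_type, data_path):
            break
        shards.append(os.path.join(data_path, f'{split_k}.{data_type}'))
    return shards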
Example No. 2
def split_exists(split, src, tgt, lang):
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
Example No. 3
def split_exists(split, src, tgt, lang, data_path):
    filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.dataset_impl == 'raw' and IndexedRawTextDataset.exists(filename):
        return True
    elif self.args.dataset_impl != 'raw' and IndexedDataset.exists(filename):
        return True
    return False
Example No. 4
def split_exists(split, data_path):
    filename = os.path.join(data_path, split)
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
Example No. 5
def split_exists(split, src, tgt, lang):
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedDataset.exists(filename):
        return True
    return False
Example No. 6
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                ds = IndexedDataset(path, fix_lua_indexing=True)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    self.args.tokens_per_sample,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                    break_mode=self.args.sample_break_mode,
                    include_targets=True,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
        )
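For orientation, a load_dataset like the one above is normally driven through the task API rather than called in isolation. A minimal usage sketch, assuming a fairseq-style LanguageModelingTask and an already-parsed args namespace; both are assumptions, not shown in this listing:

from fairseq.tasks.language_modeling import LanguageModelingTask

# Hedged sketch: 'args' is assumed to carry data, raw_text, tokens_per_sample
# and sample_break_mode, as consumed by the method above.
task = LanguageModelingTask.setup_task(args)
task.load_dataset('train', combine=True)   # fills task.datasets['train']
train_set = task.dataset('train')          # the MonolingualDataset built above
print('| loaded', len(train_set), 'examples')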
Example No. 7
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    BlockDataset(
                        tokens,
                        ds.sizes,
                        self.args.tokens_per_sample,
                    ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        if split == 'valid':
            fix_seed = True
        else:
            fix_seed = False

        self.datasets[split] = BertDataset(
            dataset, sizes, self.dictionary, self.args.shuffle_instance,
            self.seed, fix_seed, self.args.token_mask_ratio,
            self.args.token_noise_prob, self.args.token_clean_prob,
            self.args.sent_pos_mask_ratio, self.args.sent_pos_noise_prob,
            self.args.sent_pos_clean_prob)
Example No. 8
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []
        loaded_labels = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedDataset(path, fix_lua_indexing=False)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    0,
                    pad=self.dictionary.pad(),
                    break_mode='eos',
                    include_targets=False,
                ))

            with open(path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                loaded_labels.extend(int(l) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = SentenceClassificationDataset(
            dataset,
            loaded_labels,
            sizes,
            self.dictionary,
        )
Example No. 9
def split_para_exists(split, key, lang):
    filename = os.path.join(self.args.data, '{}.{}.{}'.format(split, key, lang))
    print(filename); print(self.args.raw_text)
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedDataset.exists(filename):
        return True
    return False
Example No. 10
    def load_dataset(self, split, combine=False):
        """Load a dataset split."""

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            cbt_booktitle_idx = None
            if self.args.sample_break_mode == 'cbt_booktitle':
                if self.dictionary.index(
                        '_BOOK_TITLE_') != self.dictionary.unk():
                    cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_')

            loaded_datasets.append(
                TokenBlockDataset(
                    tokens,
                    ds.sizes,
                    self.args.tokens_per_sample,
                    self.args.sample_break_mode,
                    include_targets=True,
                    cbt_booktitle_idx=cbt_booktitle_idx,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = MonolingualDataset(dataset,
                                                  sizes,
                                                  self.dictionary,
                                                  shuffle=False)
Example No. 11
def split_exists(split, src, tgt, lang, data_path):
    if src is not None:
        filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    else:
        filename = os.path.join(data_path, '{}.{}-None.{}'.format(split, src, tgt))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedDataset.exists(filename):
        return True
    return False
Example No. 12
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        dataset_map = OrderedDict()

        for lang in self.langs2id.keys():
            if self.default_key is None:
                self.default_key = lang
            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = '{}.{}'.format(split, lang)
            path = os.path.join(self.args.data, language_split)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(
                    language_split, self.args.data))

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            block_dataset = TokenBlockDataset(
                dataset=ds,
                sizes=ds.sizes,
                block_size=self.args.tokens_per_sample - 1,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos())

            dataset_map[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=block_dataset.sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, 'shuffle', False),
                has_pairs=False,
                segment_id=self.langs2id[lang],
                seed=self.seed,
            )

        self.datasets[split] = MultiCorpusSampledDataset(
            dataset_map, default_key=self.default_key)
        print('| {} {} {} examples'.format(self.args.data, split,
                                           len(self.datasets[split])))
Example No. 13
    def load_dataset_ordering(self, input_ordered_file, input_shuffled_file):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        assert self.args.raw_text and IndexedRawTextDataset.exists(
            input_shuffled_file)
        ds = IndexedRawTextDataset(input_shuffled_file, self.dictionary)
        tokens = [t for l in ds.tokens_list for t in l]

        loaded_datasets.append(
            TokenBlockDataset(
                tokens,
                ds.sizes,
                self.args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.args.sample_break_mode,
                include_targets=True,
            ))

        print('| {} {} examples'.format(input_shuffled_file,
                                        len(loaded_datasets[-1])))

        # if not combine:
        #     break

        assert len(loaded_datasets) == 1
        dataset = loaded_datasets[0]
        sizes = dataset.sizes

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets['test'] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False,
            targets=self.targets,
        )
Example No. 14
    def load_dataset(self, split):
        """Load a dataset split."""
        path = os.path.join(self.args.data, split)
        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            tokens = ds.tokens_list
        elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer
        else:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

        dataset = TokenBlockDataset(
            tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode,
            include_targets=True,  # return next tokens as targets
        )
        self.datasets[split] = MonolingualDataset(dataset, dataset.sizes, self.dictionary, shuffle=False)
Example No. 15
    def _load_single_lang_dataset(self, split):
        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    ds.sizes,
                    self.args.tokens_per_sample - 1,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        return dataset, sizes
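Unlike the other examples, _load_single_lang_dataset returns the (dataset, sizes) pair instead of registering it under self.datasets. A plausible follow-up inside the same task, reusing the wrapper keywords from Example No. 16 below; treat it as a sketch, not the original author's code:

# Sketch only: wrap the returned pair the way Example No. 16 does.
dataset, sizes = self._load_single_lang_dataset(split)
self.datasets[split] = MaskedLMDataset(
    dataset=dataset,
    sizes=sizes,
    vocab=self.dictionary,
    pad_idx=self.dictionary.pad(),
    mask_idx=self.dictionary.mask(),
    classif_token_idx=self.dictionary.cls(),
    sep_token_idx=self.dictionary.sep(),
    shuffle=False,
    seed=self.seed,
)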
Example No. 16
    def load_dataset(self, split, combine=False):
        """
        Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    BlockPairDataset(
                        ds,
                        self.dictionary,
                        ds.sizes,
                        self.args.tokens_per_sample,
                        break_mode=self.args.break_mode,
                    ))

            logger.info('{} {} {} examples'.format(self.args.data, split_k,
                                                   len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = MaskedLMDataset(
            dataset=dataset,
            sizes=sizes,
            vocab=self.dictionary,
            pad_idx=self.dictionary.pad(),
            mask_idx=self.dictionary.mask(),
            classif_token_idx=self.dictionary.cls(),
            sep_token_idx=self.dictionary.sep(),
            shuffle=False,
            seed=self.seed,
        )
Example No. 17
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    ModifiedBlockPairDataset(
                        tokens,
                        ds.sizes,
                        self.args.tokens_per_sample,
                        pad=self.dictionary.pad(),
                        class_positive=self.dictionary.class_positive(),
                        class_negative=self.dictionary.class_negative(),
                        sep=self.dictionary.sep(),
                        vocab=self.dictionary,
                        break_mode=self.args.break_mode,
                        short_seq_prob=self.args.short_seq_prob,
                    ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = ModifiedBertDataset(
            dataset,
            sizes,
            self.dictionary,
            shuffle=self.args.shuffle_instance,
            seed=self.seed,
            mask_ratio=self.args.mask_ratio,
            lower=self.args.span_lower,
            upper=self.args.span_upper,
            geometric_p=self.args.geometric_p)
Example No. 18
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = [[], []]
        loaded_labels = []
        stop = False

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            base_path = os.path.join(self.args.data, split_k)
            path1 = os.path.join(base_path + '_s1')
            path2 = os.path.join(base_path + '_s2')

            for path, datasets in zip([path1, path2], loaded_datasets):
                if self.args.raw_text and IndexedRawTextDataset.exists(path):
                    ds = IndexedRawTextDataset(path, self.dictionary)
                elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                        path):
                    ds = IndexedDataset(path, fix_lua_indexing=False)
                else:
                    if k > 0:
                        stop = True
                        break
                    else:
                        raise FileNotFoundError(
                            'Dataset not found: {} ({})'.format(
                                split, self.args.data))

                datasets.append(
                    TokenBlockDataset(
                        ds,
                        0,
                        pad=self.dictionary.pad(),
                        break_mode='eos',
                        include_targets=False,
                    ))

            if stop:
                break
            with open(base_path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                cast = int if self.num_labels > 1 else float
                loaded_labels.extend(cast(l.rstrip()) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[0][-1])))

            if not combine:
                break

        if self.num_labels == 2:
            loaded_labels = [l if l == 1 else 0 for l in loaded_labels]

        if len(loaded_datasets[0]) == 1:
            dataset1 = loaded_datasets[0][0]
            dataset2 = loaded_datasets[1][0]
            sizes1 = dataset1.sizes
            sizes2 = dataset2.sizes
        else:
            dataset1 = ConcatDataset(loaded_datasets[0])
            dataset2 = ConcatDataset(loaded_datasets[1])
            sizes1 = np.concatenate([ds.sizes for ds in loaded_datasets[0]])
            sizes2 = np.concatenate([ds.sizes for ds in loaded_datasets[1]])
        self.datasets[split] = SentencePairClassificationDataset(
            dataset1, dataset2, loaded_labels, sizes1, sizes2, self.dictionary)
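The pair loader above expects three files per shard; a sketch of the on-disk layout it assumes, with directory and split names purely illustrative:

# <data>/train_s1      first sentence of each pair (raw text or binarized)
# <data>/train_s2      second sentence of each pair
# <data>/train.lbl     one label per line: int when num_labels > 1, float otherwise
# Extra shards follow the same pattern: train1_s1, train1_s2, train1.lbl, ...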
Example No. 19
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            tag_map = None
            if self.args.tag_bitmap_file_prefix is not None:
                print("self.args.tag_bitmap_file_prefix is not None")
                tag_map = bitarray()
                tag_map.fromfile(
                    open(self.args.tag_bitmap_file_prefix + split, 'rb'))

            block_cls = BlockPairDataset if not self.no_nsp else BlockDataset
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    block_cls(tokens,
                              ds.sizes,
                              self.args.tokens_per_sample,
                              pad=self.dictionary.pad(),
                              cls=self.dictionary.cls(),
                              mask=self.dictionary.mask(),
                              sep=self.dictionary.sep(),
                              break_mode=self.args.break_mode,
                              short_seq_prob=self.short_seq_prob,
                              tag_map=tag_map))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break
        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
        dataset_cls = SpanBertDataset if not self.no_nsp else NoNSPSpanBertDataset
        self.datasets[split] = dataset_cls(dataset,
                                           sizes,
                                           self.dictionary,
                                           shuffle=self.args.shuffle_instance,
                                           seed=self.seed,
                                           args=self.args)