Python IndexedRawTextDataset Exemples, fairseq.data.IndexedRawTextDataset Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : summerization.py Projet : malikaltakrori/qags-1

 def indexed_dataset(path, dictionary):
     """Open the dataset at ``path``; return ``None`` when it does not exist.

     ``self`` is taken from the enclosing scope. When ``self.args.raw_text``
     is set the file is opened as an ``IndexedRawTextDataset`` built with
     ``dictionary``; otherwise an ``IndexedDataset`` is returned with
     ``fix_lua_indexing=False``.
     """
     if self.args.raw_text:
         if IndexedRawTextDataset.exists(path):
             return IndexedRawTextDataset(path, dictionary)
         return None
     # Existence is probed via IndexedInMemoryDataset, yet the object handed
     # back is an IndexedDataset.
     if IndexedInMemoryDataset.exists(path):
         return IndexedDataset(path, fix_lua_indexing=False)
     return None

Exemple #2

0

Afficher le fichier

    def load_dataset(self, split, combine=False):
        """Load ``split`` and register it in ``self.datasets`` as a BertDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedInMemoryDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
                # Flatten the per-line token lists into one token sequence.
                tokens = [tok for line in ds.tokens_list for tok in line]
            else:
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer

            # The block dataset is constructed under a per-shard numpy seed.
            with data_utils.numpy_seed(self.seed + k):
                block = BlockDataset(
                    tokens,
                    ds.sizes,
                    self.args.tokens_per_sample,
                )
                shards.append(block)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        # Only the validation split gets a fixed seed.
        fix_seed = split == 'valid'

        self.datasets[split] = BertDataset(
            dataset, sizes, self.dictionary, self.args.shuffle_instance,
            self.seed, fix_seed, self.args.token_mask_ratio,
            self.args.token_noise_prob, self.args.token_clean_prob,
            self.args.sent_pos_mask_ratio, self.args.sent_pos_noise_prob,
            self.args.sent_pos_clean_prob)

Exemple #3

0

Afficher le fichier

Fichier : language_modeling.py Projet : xuehaouwa/fairseq

    def load_dataset(self, split, combine=False, **kwargs):
        """Load ``split`` and register it as a MonolingualDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing
            **kwargs: accepted but not used by this method

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
            else:
                ds = IndexedDataset(path, fix_lua_indexing=True)

            block = TokenBlockDataset(
                ds,
                self.args.tokens_per_sample,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
                break_mode=self.args.sample_break_mode,
                include_targets=True,
            )
            shards.append(block)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        break_mode = self.args.sample_break_mode
        add_eos_for_other_targets = (
            break_mode is not None and break_mode != 'none')

        self.datasets[split] = MonolingualDataset(
            dataset,
            sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
        )

Exemple #4

0

Afficher le fichier

    def load_dataset(self, split, combine=False):
        """Load ``split`` with its labels as a SentenceClassificationDataset.

        Each shard's labels are read from ``<shard path>.lbl``, one integer
        per line.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []
        labels = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedInMemoryDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
            else:
                ds = IndexedDataset(path, fix_lua_indexing=False)

            shards.append(
                TokenBlockDataset(
                    ds,
                    0,
                    pad=self.dictionary.pad(),
                    break_mode='eos',
                    include_targets=False,
                ))

            with open(path + '.lbl', 'r') as label_file:
                labels.extend(int(line) for line in label_file)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        self.datasets[split] = SentenceClassificationDataset(
            dataset,
            labels,
            sizes,
            self.dictionary,
        )

Exemple #5

0

Afficher le fichier

    def load_dataset(self, split, combine=False):
        """Load ``split`` and register it as an unshuffled MonolingualDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedInMemoryDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
                # Flatten the per-line token lists into one token sequence.
                tokens = [tok for line in ds.tokens_list for tok in line]
            else:
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
                tokens = ds.buffer

            # The book-title index is only forwarded in 'cbt_booktitle' mode
            # and only when the marker does not map to the unknown symbol.
            cbt_booktitle_idx = None
            if (self.args.sample_break_mode == 'cbt_booktitle'
                    and self.dictionary.index('_BOOK_TITLE_')
                    != self.dictionary.unk()):
                cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_')

            block = TokenBlockDataset(
                tokens,
                ds.sizes,
                self.args.tokens_per_sample,
                self.args.sample_break_mode,
                include_targets=True,
                cbt_booktitle_idx=cbt_booktitle_idx,
            )
            shards.append(block)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        self.datasets[split] = MonolingualDataset(
            dataset, sizes, self.dictionary, shuffle=False)

Exemple #6

0

Afficher le fichier

Fichier : cross_lingual_lm.py Projet : ictnlp/DiverseNMT

    def load_dataset(self, split, combine=False):
        """Load ``split`` for every language and register the sampled mixture.

        One MaskedLMDataset is built per key of ``self.langs2id`` and the
        results are wrapped in a MultiCorpusSampledDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): accepted but not used by this method

        Raises:
            FileNotFoundError: if the data for any language is missing.
        """
        per_language = OrderedDict()

        for lang, lang_id in self.langs2id.items():
            # The first language seen becomes the default key if none is set.
            if self.default_key is None:
                self.default_key = lang

            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = '{}.{}'.format(split, lang)
            path = os.path.join(self.args.data, language_split)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedDataset.exists(path)
            if not found:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(
                    language_split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
            else:
                impl = (IndexedDataset if self.args.lazy_load
                        else IndexedCachedDataset)
                ds = impl(path, fix_lua_indexing=True)

            # Each block later gets the classification token appended, so
            # blocks are built one token shorter than tokens_per_sample.
            block_dataset = TokenBlockDataset(
                dataset=ds,
                sizes=ds.sizes,
                block_size=self.args.tokens_per_sample - 1,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos())

            per_language[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=block_dataset.sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, 'shuffle', False),
                has_pairs=False,
                segment_id=lang_id,
                seed=self.seed,
            )

        combined = MultiCorpusSampledDataset(
            per_language, default_key=self.default_key)
        self.datasets[split] = combined
        print('| {} {} {} examples'.format(self.args.data, split,
                                           len(self.datasets[split])))

Exemple #7

0

Afficher le fichier

Fichier : translation.py Projet : fyabc/fairseq

 def split_exists(split, src, tgt, lang):
     """Return ``True`` if ``<split>.<src>-<tgt>.<lang>`` exists under ``self.args.data``.

     ``self`` is taken from the enclosing scope. The raw-text checker is
     used when ``self.args.raw_text`` is set, the in-memory one otherwise.
     """
     basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
     filename = os.path.join(self.args.data, basename)
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedInMemoryDataset
     return True if checker.exists(filename) else False

Exemple #8

0

Afficher le fichier

Fichier : glue.py Projet : zhuohan123/macaron-net

 def split_exists(split, data_path):
     """Return ``True`` if the file ``split`` exists inside ``data_path``.

     ``self`` is taken from the enclosing scope. The raw-text checker is
     used when ``self.args.raw_text`` is set, the in-memory one otherwise.
     """
     filename = os.path.join(data_path, split)
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedInMemoryDataset
     return True if checker.exists(filename) else False

Exemple #9

0

Afficher le fichier

 def split_exists(split, src, tgt, lang, data_path):
     """Return ``True`` if ``<split>.<src>-<tgt>.<lang>`` exists in ``data_path``.

     ``self`` is taken from the enclosing scope. The raw-text checker is
     used when ``self.args.dataset_impl`` equals ``'raw'``; every other
     value is checked through ``IndexedDataset``.
     """
     basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
     filename = os.path.join(data_path, basename)
     if self.args.dataset_impl == 'raw':
         checker = IndexedRawTextDataset
     else:
         checker = IndexedDataset
     return True if checker.exists(filename) else False

Exemple #10

0

Afficher le fichier

Fichier : translation.py Projet : hmc-cs-mdrissi/fairseq

 def indexed_dataset(path, dictionary):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. In raw-text mode a tokenizer
     is built from ``self.args`` and handed to ``IndexedRawTextDataset``
     without any existence check.
     """
     if not self.args.raw_text:
         if IndexedInMemoryDataset.exists(path):
             return IndexedInMemoryDataset(path, fix_lua_indexing=True)
         return None
     tokenizer_tool = tokenizer.build_tokenizer(self.args)
     return IndexedRawTextDataset(tokenizer_tool, path, dictionary)

Exemple #11

0

Afficher le fichier

    def load_sentence(self, split, sentence):
        """Wrap one sentence as the MonolingualDataset stored under ``split``.

        The sentence is split on single spaces and the resulting word list is
        handed to ``IndexedRawTextDataset`` in place of a path.
        NOTE(review): this relies on a project-specific IndexedRawTextDataset
        that accepts a word list - confirm against its definition.

        Args:
            split (str): key under which the dataset is stored in
                ``self.datasets``
            sentence (str): space-separated text to load
        """
        words = sentence.split(' ')
        ds = IndexedRawTextDataset(words, self.dictionary)

        # Exactly one block dataset is ever built here, so no
        # ConcatDataset / size concatenation is needed.
        dataset = TokenBlockDataset(
            ds,
            ds.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        break_mode = self.args.sample_break_mode
        add_eos_for_other_targets = (
            break_mode is not None and break_mode != 'none')

        self.datasets[split] = MonolingualDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
        )

Exemple #12

0

Afficher le fichier

 def split_exists(split, src, tgt, lang):
     """Return ``True`` if ``<split>.<src>-<tgt>.<lang>`` exists under ``self.args.data``.

     ``self`` is taken from the enclosing scope. The raw-text checker is
     used when ``self.args.raw_text`` is set, ``IndexedDataset`` otherwise.
     """
     basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
     filename = os.path.join(self.args.data, basename)
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedDataset
     return True if checker.exists(filename) else False

Exemple #13

0

Afficher le fichier

Fichier : fairseq_classification.py Projet : MaratSaidov/source-code-summarization

 def split_exists(split, data_type, data_path):
     """Return ``True`` if ``<split>.<data_type>`` exists inside ``data_path``.

     ``self`` is taken from the enclosing scope. The raw-text checker is
     used when ``self.args.raw_text`` is set, ``IndexedDataset`` otherwise.
     """
     filename = os.path.join(data_path, f'{split}.{data_type}')
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedDataset
     return True if checker.exists(filename) else False

Exemple #14

0

Afficher le fichier

 def indexed_dataset(path, dictionary):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. In raw-text mode the
     dataset is constructed without any existence check.
     """
     if not self.args.raw_text:
         if IndexedInMemoryDatasetStruct.exists(path):
             return IndexedInMemoryDatasetStruct(path, fix_lua_indexing=True)
         return None
     return IndexedRawTextDataset(path, dictionary)

Exemple #15

0

Afficher le fichier

 def indexed_dataset(path, dictionary):
     """Log the load, then open the dataset at ``path`` (``None`` if absent).

     ``self`` and ``is_training`` are taken from the enclosing scope. In
     raw-text mode the dataset is constructed without any existence check.
     """
     banner = "| ---- loading data from {}, is_training={}".format(
         path, is_training)
     print(banner)
     if not self.args.raw_text:
         if IndexedInMemoryDataset.exists(path):
             return IndexedInMemoryDataset(path)
         return None
     return IndexedRawTextDataset(path, dictionary)

Exemple #16

0

Afficher le fichier

 def indexed_dataset(path, dictionary, debug=False):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. ``debug`` is forwarded
     unchanged to whichever dataset class gets constructed.
     """
     if not self.args.raw_text:
         if IndexedInMemoryDataset.exists(path):
             return IndexedInMemoryDataset(
                 path, fix_lua_indexing=True, debug=debug)
         return None
     return IndexedRawTextDataset(path, dictionary, debug=debug)

Exemple #17

0

Afficher le fichier

 def split_para_exists(split, key, lang):
     """Return ``True`` if ``<split>.<key>.<lang>`` exists under ``self.args.data``.

     ``self`` is taken from the enclosing scope. The resolved file name and
     the raw-text flag are printed before the check.
     """
     basename = '{}.{}.{}'.format(split, key, lang)
     filename = os.path.join(self.args.data, basename)
     print(filename)
     print(self.args.raw_text)
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedDataset
     return True if checker.exists(filename) else False

Exemple #18

0

Afficher le fichier

    def load_dataset_ordering(self, input_ordered_file, input_shuffled_file):
        """Load a raw-text file and register it as ``self.datasets['test']``.

        Args:
            input_ordered_file (str): not used by this method
            input_shuffled_file (str): path of the raw-text file to load

        Raises:
            AssertionError: if ``self.args.raw_text`` is not set or
                ``input_shuffled_file`` does not exist.
        """
        assert self.args.raw_text and IndexedRawTextDataset.exists(
            input_shuffled_file)
        ds = IndexedRawTextDataset(input_shuffled_file, self.dictionary)
        # Flatten the per-line token lists into one token sequence.
        tokens = [tok for line in ds.tokens_list for tok in line]

        # Exactly one block dataset is ever built here, so no
        # ConcatDataset / size concatenation is needed.
        dataset = TokenBlockDataset(
            tokens,
            ds.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        print('| {} {} examples'.format(input_shuffled_file, len(dataset)))

        break_mode = self.args.sample_break_mode
        add_eos_for_other_targets = (
            break_mode is not None and break_mode != 'none')

        self.datasets['test'] = MonolingualDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False,
            targets=self.targets,
        )

Exemple #19

0

Afficher le fichier

Fichier : language_modeling.py Projet : s1879281/gcn-wikicatsum

    def load_dataset(self, split):
        """Load ``split`` and register it as an unshuffled MonolingualDataset.

        Args:
            split (str): file name of the split inside ``self.args.data``

        Raises:
            FileNotFoundError: if the split does not exist in the format
                selected by ``self.args.raw_text``.
        """
        path = os.path.join(self.args.data, split)

        raw_text = self.args.raw_text
        if raw_text:
            found = IndexedRawTextDataset.exists(path)
        else:
            found = IndexedInMemoryDataset.exists(path)
        if not found:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

        if raw_text:
            ds = IndexedRawTextDataset(path, self.dictionary)
            # NOTE(review): tokens_list is passed through as-is; confirm that
            # TokenBlockDataset accepts it rather than a flat token sequence.
            tokens = ds.tokens_list
        else:
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer

        # include_targets=True: return next tokens as targets
        block_dataset = TokenBlockDataset(
            tokens,
            ds.sizes,
            self.args.tokens_per_sample,
            self.args.sample_break_mode,
            include_targets=True,
        )
        self.datasets[split] = MonolingualDataset(
            block_dataset, block_dataset.sizes, self.dictionary, shuffle=False)

Exemple #20

0

Afficher le fichier

    def _load_single_lang_dataset(self, split):
        """Load every shard of ``split`` and return ``(dataset, sizes)``.

        Shards ``<split>``, ``<split>1``, ``<split>2``, ... are loaded until
        one is missing.

        Args:
            split (str): name of the split

        Returns:
            tuple: the single block dataset (or a ConcatDataset of all
            shards) together with its sizes.

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
            else:
                impl = (IndexedDataset if self.args.lazy_load
                        else IndexedCachedDataset)
                ds = impl(path, fix_lua_indexing=True)

            # Each block later gets the classification token appended, so
            # blocks are built one token shorter than tokens_per_sample.
            block = TokenBlockDataset(
                ds,
                ds.sizes,
                self.args.tokens_per_sample - 1,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos(),
            )
            shards.append(block)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        return dataset, sizes

Exemple #21

0

Afficher le fichier

Fichier : xmasked_seq2seq.py Projet : ljw9609/NMT-MASS

 def indexed_dataset(path, dictionary):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. In raw-text mode the
     dataset is constructed without any existence check; otherwise
     ``self.args.lazy_load`` picks between ``IndexedDataset`` and
     ``IndexedCachedDataset``.
     """
     if self.args.raw_text:
         return IndexedRawTextDataset(path, dictionary)
     if not IndexedDataset.exists(path):
         return None
     impl = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
     return impl(path, fix_lua_indexing=True)

Exemple #22

0

Afficher le fichier

Fichier : translation.py Projet : xdg988/Capstone-Web-UI

 def indexed_dataset(path, dictionary, copy_ext_dict=False, src_dataset=None):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. ``copy_ext_dict`` and
     ``src_dataset`` are only forwarded in raw-text mode; otherwise
     ``self.args.lazy_load`` picks between ``IndexedDataset`` and
     ``IndexedCachedDataset``.
     """
     if self.args.raw_text:
         return IndexedRawTextDataset(
             path,
             dictionary,
             copy_ext_dict=copy_ext_dict,
             src_dataset=src_dataset,
         )
     if not IndexedDataset.exists(path):
         return None
     impl = IndexedDataset if self.args.lazy_load else IndexedCachedDataset
     return impl(path, fix_lua_indexing=True)

Exemple #23

0

Afficher le fichier

 def indexed_dataset(path, dictionary, ex_dict=None, is_tgt=False):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. ``self.args.segment`` takes
     precedence over ``self.args.raw_text``; ``ex_dict`` and ``is_tgt`` are
     only used in segment mode.
     """
     if self.args.segment:
         return IndexedRawTextSegDataset(path, dictionary, ex_dict, is_tgt)
     if self.args.raw_text:
         return IndexedRawTextDataset(path, dictionary)
     if IndexedDataset.exists(path):
         return IndexedCachedDataset(path, fix_lua_indexing=True)
     return None

Exemple #24

0

Afficher le fichier

Fichier : translation.py Projet : jind11/TitleStylist

 def split_exists(split, src, tgt, lang, data_path):
     """Return ``True`` if the file for the given split exists in ``data_path``.

     ``self`` is taken from the enclosing scope. With a source language the
     file name is ``<split>.<src>-<tgt>.<lang>``.
     """
     if src is None:
         # NOTE(review): this pattern is filled with (split, src, tgt) and
         # ignores ``lang``; confirm that is the intended file naming.
         basename = '{}.{}-None.{}'.format(split, src, tgt)
     else:
         basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
     filename = os.path.join(data_path, basename)
     if self.args.raw_text:
         checker = IndexedRawTextDataset
     else:
         checker = IndexedDataset
     return True if checker.exists(filename) else False

Exemple #25

0

Afficher le fichier

 def indexed_dataset(path, dictionary, cached=True, audio=False):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. In raw-text mode the
     dataset is constructed without any existence check and ``audio`` is
     not forwarded; otherwise ``cached`` picks between
     ``IndexedCachedDataset`` and ``IndexedDataset``.
     """
     if self.args.raw_text:
         return IndexedRawTextDataset(path, dictionary)
     if not IndexedDataset.exists(path):
         return None
     impl = IndexedCachedDataset if cached else IndexedDataset
     return impl(path, fix_lua_indexing=True, audio=audio)

Exemple #26

0

Afficher le fichier

Fichier : sentence_pair_classification_task.py Projet : malikaltakrori/qags-1

    def load_dataset(self, split, combine=False):
        """Load a sentence-pair split with labels into ``self.datasets[split]``.

        For each shard the two sentence files ``<shard>_s1`` and ``<shard>_s2``
        and the label file ``<shard>.lbl`` are read.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if a sentence file of the first (unnumbered)
                shard is missing.
        """

        # loaded_datasets[0] collects the "_s1" shards, [1] the "_s2" shards.
        loaded_datasets = [[], []]
        loaded_labels = []
        # Set by the inner loop to signal that the outer loop must end too.
        stop = False

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            base_path = os.path.join(self.args.data, split_k)
            # Single-argument os.path.join returns its argument unchanged.
            path1 = os.path.join(base_path + '_s1')
            path2 = os.path.join(base_path + '_s2')

            for path, datasets in zip([path1, path2], loaded_datasets):
                if self.args.raw_text and IndexedRawTextDataset.exists(path):
                    ds = IndexedRawTextDataset(path, self.dictionary)
                elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                        path):
                    ds = IndexedDataset(path, fix_lua_indexing=False)
                else:
                    if k > 0:
                        # NOTE(review): if "_s1" was found but "_s2" is
                        # missing, the "_s1" shard has already been appended,
                        # leaving the two lists with different lengths.
                        stop = True
                        break
                    else:
                        raise FileNotFoundError(
                            'Dataset not found: {} ({})'.format(
                                split, self.args.data))

                datasets.append(
                    TokenBlockDataset(
                        ds,
                        0,
                        pad=self.dictionary.pad(),
                        break_mode='eos',
                        include_targets=False,
                    ))

            if stop:
                break
            with open(base_path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                # Labels are parsed as int when there is more than one label,
                # as float otherwise (presumably regression - confirm).
                cast = int if self.num_labels > 1 else float
                loaded_labels.extend(cast(l.rstrip()) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[0][-1])))

            if not combine:
                break

        # With two labels, every label other than 1 is mapped to 0.
        if self.num_labels == 2:
            loaded_labels = [l if l == 1 else 0 for l in loaded_labels]

        if len(loaded_datasets[0]) == 1:
            dataset1 = loaded_datasets[0][0]
            dataset2 = loaded_datasets[1][0]
            sizes1 = dataset1.sizes
            sizes2 = dataset2.sizes
        else:
            dataset1 = ConcatDataset(loaded_datasets[0])
            dataset2 = ConcatDataset(loaded_datasets[1])
            sizes1 = np.concatenate([ds.sizes for ds in loaded_datasets[0]])
            sizes2 = np.concatenate([ds.sizes for ds in loaded_datasets[1]])
        self.datasets[split] = SentencePairClassificationDataset(
            dataset1, dataset2, loaded_labels, sizes1, sizes2, self.dictionary)
Exemple #27

0

Afficher le fichier

    def load_dataset(self, split, combine=False):
        """Load ``split`` and register it as an unshuffled MaskedLMDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
            else:
                impl = (IndexedDataset if self.args.lazy_load
                        else IndexedCachedDataset)
                ds = impl(path, fix_lua_indexing=True)

            # The block-pair dataset is built under a per-shard numpy seed.
            with data_utils.numpy_seed(self.seed + k):
                pair_blocks = BlockPairDataset(
                    ds,
                    self.dictionary,
                    ds.sizes,
                    self.args.tokens_per_sample,
                    break_mode=self.args.break_mode,
                )
                shards.append(pair_blocks)

            logger.info('{} {} {} examples'.format(self.args.data, split_k,
                                                   len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        self.datasets[split] = MaskedLMDataset(
            dataset=dataset,
            sizes=sizes,
            vocab=self.dictionary,
            pad_idx=self.dictionary.pad(),
            mask_idx=self.dictionary.mask(),
            classif_token_idx=self.dictionary.cls(),
            sep_token_idx=self.dictionary.sep(),
            shuffle=False,
            seed=self.seed,
        )

Exemple #28

0

Afficher le fichier

Fichier : block_transformer_lm.py Projet : malikaltakrori/qags-1

    def load_dataset(self, split, combine=False):
        """Load ``split`` and register it as a ModifiedBertDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """
        shards = []

        for k in itertools.count():
            suffix = str(k) if k > 0 else ''
            split_k = split + suffix
            path = os.path.join(self.args.data, split_k)

            raw_text = self.args.raw_text
            if raw_text:
                found = IndexedRawTextDataset.exists(path)
            else:
                found = IndexedInMemoryDataset.exists(path)

            if not found:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(
                        split, self.args.data))

            if raw_text:
                ds = IndexedRawTextDataset(path, self.dictionary)
                # Flatten the per-line token lists into one token sequence.
                tokens = [tok for line in ds.tokens_list for tok in line]
            else:
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer

            # The block-pair dataset is built under a per-shard numpy seed.
            with data_utils.numpy_seed(self.seed + k):
                pair_blocks = ModifiedBlockPairDataset(
                    tokens,
                    ds.sizes,
                    self.args.tokens_per_sample,
                    pad=self.dictionary.pad(),
                    class_positive=self.dictionary.class_positive(),
                    class_negative=self.dictionary.class_negative(),
                    sep=self.dictionary.sep(),
                    vocab=self.dictionary,
                    break_mode=self.args.break_mode,
                    short_seq_prob=self.args.short_seq_prob,
                )
                shards.append(pair_blocks)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(shards[-1])))

            if not combine:
                break

        if len(shards) != 1:
            dataset = ConcatDataset(shards)
            sizes = np.concatenate([shard.sizes for shard in shards])
        else:
            dataset = shards[0]
            sizes = dataset.sizes

        self.datasets[split] = ModifiedBertDataset(
            dataset,
            sizes,
            self.dictionary,
            shuffle=self.args.shuffle_instance,
            seed=self.seed,
            mask_ratio=self.args.mask_ratio,
            lower=self.args.span_lower,
            upper=self.args.span_upper,
            geometric_p=self.args.geometric_p)

Exemple #29

0

Afficher le fichier

    def load_dataset(self, split, combine=False):
        """Load ``split`` and register it as a (NoNSP)SpanBertDataset.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): when true, keep loading the numbered shards
                ``<split>1``, ``<split>2``, ... until one is missing

        Raises:
            FileNotFoundError: if the first (unnumbered) shard is missing.
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                # Flatten the per-line token lists into one token sequence.
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    # Ran past the last numbered shard.
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            tag_map = None
            if self.args.tag_bitmap_file_prefix is not None:
                print("self.args.tag_bitmap_file_prefix is not None")
                tag_map = bitarray()
                # Close the bitmap file as soon as it has been read instead
                # of leaking the handle. The file name is built from
                # ``split``, not from the shard name ``split_k``.
                with open(self.args.tag_bitmap_file_prefix + split,
                          'rb') as tag_file:
                    tag_map.fromfile(tag_file)

            # Without next-sentence prediction, plain blocks are used
            # instead of block pairs.
            block_cls = BlockPairDataset if not self.no_nsp else BlockDataset
            # The block dataset is built under a per-shard numpy seed.
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    block_cls(tokens,
                              ds.sizes,
                              self.args.tokens_per_sample,
                              pad=self.dictionary.pad(),
                              cls=self.dictionary.cls(),
                              mask=self.dictionary.mask(),
                              sep=self.dictionary.sep(),
                              break_mode=self.args.break_mode,
                              short_seq_prob=self.short_seq_prob,
                              tag_map=tag_map))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break
        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
        dataset_cls = SpanBertDataset if not self.no_nsp else NoNSPSpanBertDataset
        self.datasets[split] = dataset_cls(dataset,
                                           sizes,
                                           self.dictionary,
                                           shuffle=self.args.shuffle_instance,
                                           seed=self.seed,
                                           args=self.args)

Exemple #30

0

Afficher le fichier

Fichier : translation.py Projet : yuantiku/PoDA

 def indexed_dataset(path, dictionary, src_tokens=None):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. ``src_tokens`` and
     ``self.args.reverse_order`` are only forwarded in raw-text mode, where
     the dataset is constructed without any existence check.
     """
     if not self.args.raw_text:
         if IndexedInMemoryDataset.exists(path):
             return IndexedInMemoryDataset(path, fix_lua_indexing=True)
         return None
     return IndexedRawTextDataset(
         path,
         dictionary,
         src_tokens=src_tokens,
         reverse_order=self.args.reverse_order,
     )

Exemple #31

0

Afficher le fichier

Fichier : translation.py Projet : zjplab/nmt_soft_prototype

 def indexed_dataset(path, dictionary):
     """Open the dataset at ``path``; return ``None`` if no binary data exists.

     ``self`` is taken from the enclosing scope. In raw-text mode the
     dataset is constructed without any existence check.
     """
     if not self.args.raw_text:
         if IndexedInMemoryDataset.exists(path):
             return IndexedInMemoryDataset(path)
         return None
     return IndexedRawTextDataset(path, dictionary)