Example #1
def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        tokenizer_tool = tokenizer.build_tokenizer(self.args)
        return IndexedRawTextDataset(tokenizer_tool, path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
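The helpers on this page appear to be nested inside a task method and close over self.args. For reference, a standalone sketch of the same selection logic with the flag passed explicitly; the class names follow the old fairseq.data module, and the exact import path and constructor signatures are assumptions, since they changed between fairseq releases:

from fairseq.data import IndexedInMemoryDataset, IndexedRawTextDataset

def indexed_dataset(path, dictionary, raw_text=False):
    # Standalone variant of the nested helper above: `raw_text` replaces the
    # closed-over self.args.raw_text flag.
    if raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None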
Example #2
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    BlockDataset(
                        tokens,
                        ds.sizes,
                        self.args.tokens_per_sample,
                    ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        if split == 'valid':
            fix_seed = True
        else:
            fix_seed = False

        self.datasets[split] = BertDataset(
            dataset, sizes, self.dictionary, self.args.shuffle_instance,
            self.seed, fix_seed, self.args.token_mask_ratio,
            self.args.token_noise_prob, self.args.token_clean_prob,
            self.args.sent_pos_mask_ratio, self.args.sent_pos_noise_prob,
            self.args.sent_pos_clean_prob)
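The itertools.count() loop in these load_dataset implementations encodes fairseq's shard convention: with combine=True it keeps appending train, train1, train2, ... until a shard is missing, and raises only if even the first shard is absent. A minimal standalone sketch of just that enumeration, using os.path.exists as a hypothetical stand-in for the IndexedRawTextDataset.exists / IndexedInMemoryDataset.exists checks:

import itertools
import os

def enumerate_shards(data_dir, split, combine=True):
    # Yields data_dir/split, data_dir/split1, data_dir/split2, ... while the
    # files exist; mirrors the loop above without loading anything.
    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        path = os.path.join(data_dir, split_k)
        if not os.path.exists(path):
            if k == 0:
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(split, data_dir))
            break
        yield path
        if not combine:
            break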
Example #3
def indexed_dataset(path, dictionary):
    # is_training is defined in the enclosing scope of the original source
    print("| ---- loading data from {}, is_training={}".format(
        path, is_training))
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path)
    return None
Example #4
def indexed_dataset(path, dictionary, debug=False):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary, debug=debug)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True, debug=debug)
    return None
Example #5
    def load_dataset(self, split, combine=False):
        """Load a dataset split."""

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            cbt_booktitle_idx = None
            if self.args.sample_break_mode == 'cbt_booktitle':
                if self.dictionary.index(
                        '_BOOK_TITLE_') != self.dictionary.unk():
                    cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_')

            loaded_datasets.append(
                TokenBlockDataset(
                    tokens,
                    ds.sizes,
                    self.args.tokens_per_sample,
                    self.args.sample_break_mode,
                    include_targets=True,
                    cbt_booktitle_idx=cbt_booktitle_idx,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = MonolingualDataset(dataset,
                                                  sizes,
                                                  self.dictionary,
                                                  shuffle=False)
Example #6
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    tokens, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(),
                    break_mode=self.args.sample_break_mode, include_targets=True,
                ))

            print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset, sizes, self.dictionary, self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets, shuffle=False,
            targets=self.targets,
        )
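For context, overrides like this are invoked through fairseq's task machinery rather than called directly. A rough usage sketch follows; the function names follow older fairseq releases (around 0.6.x) and the exact argument handling is an assumption, so treat it as an outline rather than the definitive API:

from fairseq import options, tasks

# Parse the usual fairseq training arguments, e.g. --task language_modeling DATA_DIR
parser = options.get_training_parser()
args = options.parse_args_and_arch(parser)

task = tasks.setup_task(args)       # instantiates the registered task class
task.load_dataset('train')          # runs an override like the one above
train_data = task.dataset('train')  # returns self.datasets['train']
print('| loaded {} training examples'.format(len(train_data)))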
Example #7
def split_exists(split, data_path):
    filename = os.path.join(data_path, split)
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
Example #8
def split_exists(src, tgt, lang):
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
Example #9
def split_exists(split, src, tgt, lang):
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
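In fairseq's translation tasks, split_exists helpers like these are typically used to work out which direction the preprocessed files were written in before the source/target datasets are loaded. A hedged sketch of that check, reusing the filename pattern above; the prefix handling is an assumption about the surrounding code:

# Inside load_dataset, after defining split_exists as above:
if split_exists(split, src, tgt, src):
    prefix = os.path.join(self.args.data, '{}.{}-{}.'.format(split, src, tgt))
elif split_exists(split, tgt, src, src):
    prefix = os.path.join(self.args.data, '{}.{}-{}.'.format(split, tgt, src))
else:
    raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))
# prefix + src and prefix + tgt would then be passed to an indexed_dataset helper.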
Example #10
def indexed_dataset(path, dictionary):
    if self.args.raw_text and IndexedRawTextDataset.exists(path):
        return IndexedRawTextDataset(path, dictionary)
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
        return IndexedDataset(path, fix_lua_indexing=False)
    return None
Example #11
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []
        loaded_labels = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedDataset(path, fix_lua_indexing=False)
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    0,
                    pad=self.dictionary.pad(),
                    break_mode='eos',
                    include_targets=False,
                ))

            with open(path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                loaded_labels.extend(int(l) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = SentenceClassificationDataset(
            dataset,
            loaded_labels,
            sizes,
            self.dictionary,
        )
Example #12
    def load_dataset(self, split):
        """Load a dataset split."""
        path = os.path.join(self.args.data, split)
        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            tokens = ds.tokens_list
        elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer
        else:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

        dataset = TokenBlockDataset(
            tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode,
            include_targets=True,  # return next tokens as targets
        )
        self.datasets[split] = MonolingualDataset(dataset, dataset.sizes, self.dictionary, shuffle=False)
Example #13
def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
Example #14
    def load_dataset(self, split, combine=False):
        """Load a dataset split."""

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            token_path = os.path.join(self.args.data, split_k)

            if IndexedInMemoryDataset.exists(token_path):
                token_ds = IndexedInMemoryDataset(token_path,
                                                  fix_lua_indexing=True)
                tokens = token_ds.buffer

                sizes = token_ds.sizes

                in_tsv_file_path = os.path.join(self.args.data,
                                                f'gap-{split}.bert.tsv')
                gap_reader = GAP_Reader(in_tsv_file_path, is_gold=True)
                gap_data = gap_reader.read()

                in_bert_file_path = os.path.join(self.args.data,
                                                 f'gap-{split}.bert.jsonl')

                gap_bert_reader = Bert_Reader(in_bert_file_path)
                gap_bert_data = gap_bert_reader.read()
                gap_bert_weights = [
                    bert_weights for _, bert_weights in gap_bert_data
                ]

                gap_texts = [d.text.split() for d in gap_data]
                assert np.array_equal(sizes, [len(t) + 1 for t in gap_texts])
                assert np.array_equal(
                    sizes,
                    [len(bert_tokens) + 1 for bert_tokens, _ in gap_bert_data])
                assert np.array_equal(
                    [d.text.split(" ") for d in gap_data],
                    [bert_tokens for bert_tokens, _ in gap_bert_data])

                gap_corefs = self.generate_gap_coref_supervision(
                    gap_data, sizes)
                assert len(gap_data) == len(gap_corefs)

            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))

            loaded_datasets.append(
                TokenBlockGapBertDataset(
                    tokens,
                    sizes,
                    self.args.tokens_per_sample,
                    gap_data,
                    gap_corefs,
                    gap_bert_weights,
                    break_mode=self.args.sample_break_mode,
                    include_targets=True))

            if split == "train":
                gap_dataset = TokenBlockGapBertDataset(
                    tokens,
                    sizes,
                    self.args.tokens_per_sample,
                    gap_data,
                    gap_corefs,
                    gap_bert_weights,
                    self.args.sample_break_mode,
                    include_targets=True)
                self.datasets["train_gap_only"] = MonolingualGapBertDataset(
                    gap_dataset,
                    gap_dataset.sizes,
                    self.token_dictionary,
                    shuffle=False)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = MonolingualGapBertDataset(dataset,
                                                         sizes,
                                                         self.token_dictionary,
                                                         shuffle=False)
Example #15
def indexed_dataset(path, dictionary):
    if args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
Example #16
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            tag_map = None
            if self.args.tag_bitmap_file_prefix is not None:
                print("self.args.tag_bitmap_file_prefix is not None")
                tag_map = bitarray()
                tag_map.fromfile(
                    open(self.args.tag_bitmap_file_prefix + split, 'rb'))

            block_cls = BlockPairDataset if not self.no_nsp else BlockDataset
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    block_cls(tokens,
                              ds.sizes,
                              self.args.tokens_per_sample,
                              pad=self.dictionary.pad(),
                              cls=self.dictionary.cls(),
                              mask=self.dictionary.mask(),
                              sep=self.dictionary.sep(),
                              break_mode=self.args.break_mode,
                              short_seq_prob=self.short_seq_prob,
                              tag_map=tag_map))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break
        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
        dataset_cls = SpanBertDataset if not self.no_nsp else NoNSPSpanBertDataset
        self.datasets[split] = dataset_cls(dataset,
                                           sizes,
                                           self.dictionary,
                                           shuffle=self.args.shuffle_instance,
                                           seed=self.seed,
                                           args=self.args)
Example #17
def indexed_dataset(path, dictionary, src_tokens=None):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary, src_tokens=src_tokens, reverse_order=self.args.reverse_order)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None

def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path)
    return None
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            path = os.path.join(self.args.data, split_k)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
                tokens = [t for l in ds.tokens_list for t in l]
            elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                    path):
                ds = IndexedInMemoryDataset(path, fix_lua_indexing=False)
                tokens = ds.buffer
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        'Dataset not found: {} ({})'.format(
                            split, self.args.data))
            with data_utils.numpy_seed(self.seed + k):
                loaded_datasets.append(
                    ModifiedBlockPairDataset(
                        tokens,
                        ds.sizes,
                        self.args.tokens_per_sample,
                        pad=self.dictionary.pad(),
                        class_positive=self.dictionary.class_positive(),
                        class_negative=self.dictionary.class_negative(),
                        sep=self.dictionary.sep(),
                        vocab=self.dictionary,
                        break_mode=self.args.break_mode,
                        short_seq_prob=self.args.short_seq_prob,
                    ))

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[-1])))

            if not combine:
                break

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        self.datasets[split] = ModifiedBertDataset(
            dataset,
            sizes,
            self.dictionary,
            shuffle=self.args.shuffle_instance,
            seed=self.seed,
            mask_ratio=self.args.mask_ratio,
            lower=self.args.span_lower,
            upper=self.args.span_upper,
            geometric_p=self.args.geometric_p)
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        loaded_datasets = [[], []]
        loaded_labels = []
        stop = False

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else '')
            base_path = os.path.join(self.args.data, split_k)
            path1 = os.path.join(base_path + '_s1')
            path2 = os.path.join(base_path + '_s2')

            for path, datasets in zip([path1, path2], loaded_datasets):
                if self.args.raw_text and IndexedRawTextDataset.exists(path):
                    ds = IndexedRawTextDataset(path, self.dictionary)
                elif not self.args.raw_text and IndexedInMemoryDataset.exists(
                        path):
                    ds = IndexedDataset(path, fix_lua_indexing=False)
                else:
                    if k > 0:
                        stop = True
                        break
                    else:
                        raise FileNotFoundError(
                            'Dataset not found: {} ({})'.format(
                                split, self.args.data))

                datasets.append(
                    TokenBlockDataset(
                        ds,
                        0,
                        pad=self.dictionary.pad(),
                        break_mode='eos',
                        include_targets=False,
                    ))

            if stop:
                break
            with open(base_path + '.lbl', 'r') as lbl_f:
                lines = lbl_f.readlines()
                cast = int if self.num_labels > 1 else float
                loaded_labels.extend(cast(l.rstrip()) for l in lines)

            print('| {} {} {} examples'.format(self.args.data, split_k,
                                               len(loaded_datasets[0][-1])))

            if not combine:
                break

        if self.num_labels == 2:
            loaded_labels = [l if l == 1 else 0 for l in loaded_labels]

        if len(loaded_datasets[0]) == 1:
            dataset1 = loaded_datasets[0][0]
            dataset2 = loaded_datasets[1][0]
            sizes1 = dataset1.sizes
            sizes2 = dataset2.sizes
        else:
            dataset1 = ConcatDataset(loaded_datasets[0])
            dataset2 = ConcatDataset(loaded_datasets[1])
            sizes1 = np.concatenate([ds.sizes for ds in loaded_datasets[0]])
            sizes2 = np.concatenate([ds.sizes for ds in loaded_datasets[1]])
        self.datasets[split] = SentencePairClassificationDataset(
            dataset1, dataset2, loaded_labels, sizes1, sizes2, self.dictionary)