Example #1
    def test_huffman_can_append(self):
        data1 = make_data()
        builder = make_code_builder(data1)
        coder = builder.build_code()

        with TemporaryDirectory() as dirname:
            prefix1 = os.path.join(dirname, "test1")
            build_dataset(prefix1, data1, coder)

            data2 = make_data()
            prefix2 = os.path.join(dirname, "test2")
            build_dataset(prefix2, data2, coder)

            prefix3 = os.path.join(dirname, "test3")

            with HuffmanMMapIndexedDatasetBuilder(prefix3, coder) as builder:
                builder.append(prefix1)
                builder.append(prefix2)

            dataset = HuffmanMMapIndexedDataset(prefix3)

            self.assertEqual(len(dataset), len(data1) + len(data2))

            decoded1 = [list(dataset.get_symbols(i)) for i in range(0, len(data1))]
            self.assertEqual(decoded1, data1)

            decoded2 = [
                list(dataset.get_symbols(i)) for i in range(len(data1), len(dataset))
            ]
            self.assertEqual(decoded2, data2)

            data_sizes = [i.item() for i in dataset.sizes]
            self.assertEqual(data_sizes[: len(data1)], sizes(data1))
            self.assertEqual(data_sizes[len(data1) : len(dataset)], sizes(data2))
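The Huffman examples on this page lean on a handful of module-level test helpers (make_data, make_counts, make_code_builder, build_dataset, sizes) and a POPULATION alphabet that are not reproduced here. Below is a minimal sketch of what they could look like, assuming fairseq's HuffmanCodeBuilder.add_symbols and HuffmanMMapIndexedDatasetBuilder.add_item APIs; the helper names, sentence lengths, and symbol alphabet are illustrative rather than the original test code.

import random
import string
import typing as tp
from collections import Counter

from fairseq.data.huffman import (
    HuffmanCodeBuilder,
    HuffmanCoder,
    HuffmanMMapIndexedDatasetBuilder,
)

# Alphabet the random sentences are drawn from (illustrative choice).
POPULATION = string.ascii_letters + string.digits


def make_sentence() -> tp.List[str]:
    # A "sentence" is simply a list of symbols of random length.
    length = random.randint(10, 50)
    return random.choices(population=list(POPULATION), k=length)


def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]:
    # Generate `length` sentences; optionally dump them to a raw text file
    # (the binarizer examples pass out_file to get an input file on disk).
    data = [make_sentence() for _ in range(length)]
    if out_file is not None:
        with open(out_file, "w", encoding="utf-8") as out:
            for sentence in data:
                out.write(" ".join(sentence) + "\n")
    return data


def make_counts(data: tp.List[tp.List[str]]) -> Counter:
    # Per-symbol frequency over the whole corpus.
    return Counter(symbol for sentence in data for symbol in sentence)


def make_code_builder(data: tp.List[tp.List[str]]) -> HuffmanCodeBuilder:
    # Feed every symbol of every sentence into the code builder.
    builder = HuffmanCodeBuilder()
    for sentence in data:
        builder.add_symbols(*sentence)
    return builder


def sizes(data: tp.List[tp.List[str]]) -> tp.List[int]:
    # Expected per-sentence length, mirrored by dataset.sizes in the tests.
    return [len(sentence) for sentence in data]


def build_dataset(prefix: str, data: tp.List[tp.List[str]], coder: HuffmanCoder) -> None:
    # Write a Huffman-compressed mmap dataset under `prefix`.
    with HuffmanMMapIndexedDatasetBuilder(prefix, coder) as builder:
        for sentence in data:
            builder.add_item(sentence)

With helpers along these lines, builder.build_code() yields a HuffmanCoder that assigns its shortest codes to the most frequent symbols, which is what Example #3 relies on when it compares the Huffman data file size against the plain mmap one.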
Example #2
    def test_coder_can_encode_decode(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        encoded = [coder.encode(sentence) for sentence in data]
        decoded = [[n.symbol for n in coder.decode(enc)] for enc in encoded]

        self.assertEqual(decoded, data)

        unseen_data = make_data()
        unseen_encoded = [coder.encode(sentence) for sentence in unseen_data]
        unseen_decoded = [
            [n.symbol for n in coder.decode(enc)] for enc in unseen_encoded
        ]
        self.assertEqual(unseen_decoded, unseen_data)
Example #3
    def test_huffman_compresses(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        with TemporaryDirectory() as dirname:
            prefix = os.path.join(dirname, "huffman")
            build_dataset(prefix, data, coder)

            prefix_mmap = os.path.join(dirname, "mmap")
            mmap_builder = indexed_dataset.make_builder(
                indexed_dataset.data_file_path(prefix_mmap),
                "mmap",
                vocab_size=len(POPULATION),
            )
            dictionary = Dictionary()
            for c in POPULATION:
                dictionary.add_symbol(c)
            dictionary.finalize()
            for sentence in data:
                mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
            mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))

            huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
            mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
            self.assertLess(huff_size, mmap_size)
Example #4
    def test_can_multiprocess(self):
        with TemporaryDirectory() as dirname:
            raw_file = os.path.join(dirname, "raw1")
            prefix = os.path.join(dirname, "test1")
            impl = "mmap"
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)
            binarizer = VocabularyDatasetBinarizer(
                vocab,
                append_eos=False,
            )
            # with one worker
            summary = FileBinarizer.multiprocess_dataset(
                raw_file,
                impl,
                binarizer,
                output_prefix=prefix,
                vocab_size=len(vocab),
                num_workers=1,
            )

            self.compare_ds_data(summary, data, prefix, impl, vocab)

            # with multiple workers
            prefix_multi = os.path.join(dirname, "test2")
            summary = FileBinarizer.multiprocess_dataset(
                raw_file,
                impl,
                binarizer,
                output_prefix=prefix_multi,
                vocab_size=len(vocab),
                num_workers=3,
            )

            self.compare_ds_data(summary, data, prefix_multi, impl, vocab)
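The binarizer examples (Examples #4, #5, #8, #11, #12 and #14) additionally assume a build_vocab helper that turns the generated sentences into a fairseq Dictionary, plus a compare_ds_data assertion helper defined on the test class and not shown on this page. A plausible build_vocab is sketched below; it only uses the Dictionary.add_symbol and Dictionary.finalize calls already seen in Example #3, and the helper name and loop are an assumption.

from fairseq.data import Dictionary


def build_vocab(data) -> Dictionary:
    # One Dictionary entry per distinct symbol seen in the corpus
    # (an assumed reconstruction, not the original helper).
    vocab = Dictionary()
    for sentence in data:
        for token in sentence:
            vocab.add_symbol(token)
    vocab.finalize()
    return vocab

compare_ds_data would then reopen the dataset written under the given prefix and check that decoding it with vocab recovers data and that the summary's counts match; its exact body is not part of these examples.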
Example #5
    def test_can_binarize_file_chunk(self):
        # test without multiprocess logic
        with TemporaryDirectory() as dirname:
            raw_file = os.path.join(dirname, "raw1")
            prefix = os.path.join(dirname, "test1")
            impl = "mmap"

            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            binarizer = VocabularyDatasetBinarizer(
                vocab,
                append_eos=False,
            )

            summary = FileBinarizer._binarize_chunk_and_finalize(
                binarizer,
                raw_file,
                offset_start=0,
                offset_end=-1,
                output_prefix=prefix,
                dataset_impl=impl,
                vocab_size=len(vocab),
            )

            self.compare_ds_data(summary, data, prefix, impl, vocab)
Example #6
    def test_code_builder_can_add(self):
        data = make_data()
        counts = make_counts(data)
        builder = make_code_builder(data)

        new_builder = builder + builder

        self.assertEqual(new_builder.symbols, counts + counts)
Example #7
    def test_code_builder_can_io(self):
        data = make_data()
        builder = make_code_builder(data)

        with NamedTemporaryFile() as tmp_fp:
            builder.to_file(tmp_fp.name)
            other_builder = HuffmanCodeBuilder.from_file(tmp_fp.name)

            self.assertEqual(builder.symbols, other_builder.symbols)
Example #8
    def test_masks_tokens(self):
        with TemporaryDirectory() as dirname:

            # prep input file
            raw_file = os.path.join(dirname, "raw")
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            # binarize
            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
            split = "train"
            bin_file = os.path.join(dirname, split)
            FileBinarizer.multiprocess_dataset(
                input_file=raw_file,
                binarizer=binarizer,
                dataset_impl="mmap",
                vocab_size=len(vocab),
                output_prefix=bin_file,
            )

            # setup task
            cfg = MaskedLMConfig(
                data=dirname,
                seed=42,
                mask_prob=0.5,  # increasing the odds of masking
                random_token_prob=0,  # avoiding random tokens for exact match
                leave_unmasked_prob=0,  # always masking for exact match
            )
            task = MaskedLMTask(cfg, binarizer.dict)

            original_dataset = task._load_dataset_split(bin_file, 1, False)

            # load datasets
            task.load_dataset(split)
            masked_dataset = task.dataset(split)

            mask_index = task.source_dictionary.index("<mask>")
            iterator = task.get_batch_iterator(
                dataset=masked_dataset,
                max_tokens=65_536,
                max_positions=4_096,
            ).next_epoch_itr(shuffle=False)
            for batch in iterator:
                for sample in range(len(batch)):
                    net_input = batch["net_input"]
                    masked_src_tokens = net_input["src_tokens"][sample]
                    masked_src_length = net_input["src_lengths"][sample]
                    masked_tgt_tokens = batch["target"][sample]

                    sample_id = batch["id"][sample]
                    original_tokens = original_dataset[sample_id]
                    original_tokens = original_tokens.masked_select(
                        masked_src_tokens[:masked_src_length] == mask_index
                    )
                    masked_tokens = masked_tgt_tokens.masked_select(
                        masked_tgt_tokens != task.source_dictionary.pad()
                    )

                    assert masked_tokens.equal(original_tokens)
Example #9
    def test_coder_can_io(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        with NamedTemporaryFile() as tmp_fp:
            coder.to_file(tmp_fp.name)
            other_coder = HuffmanCoder.from_file(tmp_fp.name)

            self.assertEqual(coder, other_coder)
Example #10
    def test_huffman_can_encode_decode(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        with TemporaryDirectory() as dirname:
            prefix = os.path.join(dirname, "test1")
            build_dataset(prefix, data, coder)
            dataset = HuffmanMMapIndexedDataset(prefix)

            self.assertEqual(len(dataset), len(data))
            decoded = [list(dataset.get_symbols(i)) for i in range(0, len(dataset))]

            self.assertEqual(decoded, data)
            data_sizes = [i.item() for i in dataset.sizes]
            self.assertEqual(data_sizes, sizes(data))
Example #11
    def test_can_binarize_line(self):
        data = make_data(length=1)
        vocab = build_vocab(data)

        binarizer = VocabularyDatasetBinarizer(vocab)

        sentence = data[0]
        summary = BinarizeSummary()

        tensor = binarizer.binarize_line(
            " ".join(sentence),
            summary,
        )

        self.assertEqual(len(tensor), len(sentence) + 1)

        self.assertEqual(summary.num_tok, len(sentence) + 1)
        self.assertEqual(summary.num_seq, 1)
Example #12
    def test_multilingual_denoising(self):
        with TemporaryDirectory() as dirname:

            # prep input file
            lang_dir = os.path.join(dirname, "en")
            os.mkdir(lang_dir)
            raw_file = os.path.join(lang_dir, "raw")
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            # binarize
            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
            split = "train"
            bin_file = os.path.join(lang_dir, split)
            dataset_impl = "mmap"
            FileBinarizer.multiprocess_dataset(
                input_file=raw_file,
                binarizer=binarizer,
                dataset_impl=dataset_impl,
                vocab_size=len(vocab),
                output_prefix=bin_file,
            )

            # setup task
            train_args = options.parse_args_and_arch(
                options.get_training_parser(),
                [
                    "--task",
                    "multilingual_denoising",
                    "--arch",
                    "bart_base",
                    "--seed",
                    "42",
                    "--mask-length",
                    "word",
                    "--permute-sentences",
                    "1",
                    "--rotate",
                    "0",
                    "--replace-length",
                    "-1",
                    "--mask",
                    "0.2",
                    dirname,
                ],
            )
            cfg = convert_namespace_to_omegaconf(train_args)
            task = MultilingualDenoisingTask(cfg.task, binarizer.dict)

            # load datasets
            original_dataset = task._load_dataset_split(bin_file, 1, False)
            task.load_dataset(split)
            masked_dataset = task.dataset(split)

            iterator = task.get_batch_iterator(
                dataset=masked_dataset,
                max_tokens=65_536,
                max_positions=4_096,
            ).next_epoch_itr(shuffle=False)
            mask_index = task.source_dictionary.index("<mask>")
            for batch in iterator:
                for sample in range(len(batch)):
                    net_input = batch["net_input"]
                    masked_src_tokens = net_input["src_tokens"][sample]
                    masked_src_length = net_input["src_lengths"][sample]
                    masked_tgt_tokens = batch["target"][sample]

                    sample_id = batch["id"][sample]
                    original_tokens = original_dataset[sample_id]
                    original_tokens = original_tokens.masked_select(
                        masked_src_tokens[:masked_src_length] == mask_index
                    )
                    masked_tokens = masked_tgt_tokens.masked_select(
                        masked_src_tokens == mask_index
                    )

                    assert masked_tokens.equal(original_tokens)
Example #13
    def test_code_builder_can_count(self):
        data = make_data()
        counts = make_counts(data)
        builder = make_code_builder(data)

        self.assertEqual(builder.symbols, counts)
Example #14
    def test_masks_token_spans(self):
        with TemporaryDirectory() as dirname:

            # prep input file
            raw_file = os.path.join(dirname, "raw")
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            # binarize
            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
            split = "train"
            bin_file = os.path.join(dirname, split)
            dataset_impl = "mmap"

            FileBinarizer.multiprocess_dataset(
                input_file=raw_file,
                binarizer=binarizer,
                dataset_impl=dataset_impl,
                vocab_size=len(vocab),
                output_prefix=bin_file,
            )

            # adding sentinel tokens
            for i in range(100):
                vocab.add_symbol(f"<extra_id_{i}>")

            # setup task
            train_args = options.parse_args_and_arch(
                options.get_training_parser(),
                [
                    "--task",
                    "span_masked_lm",
                    "--arch",
                    "bart_base",
                    "--seed",
                    "42",
                    dirname,
                ],
            )
            cfg = convert_namespace_to_omegaconf(train_args)
            task = SpanMaskedLMTask(cfg.task, binarizer.dict)

            # load datasets
            original_dataset = task._load_dataset_split(bin_file, 1, False)
            task.load_dataset(split)
            masked_dataset = task.dataset(split)

            iterator = task.get_batch_iterator(
                dataset=masked_dataset,
                max_tokens=65_536,
                max_positions=4_096,
            ).next_epoch_itr(shuffle=False)
            num_tokens = len(vocab)
            for batch in iterator:
                for sample in range(len(batch)):
                    sample_id = batch["id"][sample]
                    original_tokens = original_dataset[sample_id]
                    masked_src_tokens = batch["net_input"]["src_tokens"][
                        sample]
                    masked_src_length = batch["net_input"]["src_lengths"][
                        sample]
                    masked_tgt_tokens = batch["target"][sample]

                    original_offset = 0
                    masked_tgt_offset = 0
                    extra_id_token = len(vocab) - 1
                    for masked_src_token in masked_src_tokens[:
                                                              masked_src_length]:
                        if masked_src_token == extra_id_token:
                            assert (masked_src_token ==
                                    masked_tgt_tokens[masked_tgt_offset])
                            extra_id_token -= 1
                            masked_tgt_offset += 1
                            while (original_offset < len(original_tokens)
                                   and masked_tgt_tokens[masked_tgt_offset] !=
                                   extra_id_token):
                                assert (original_tokens[original_offset] ==
                                        masked_tgt_tokens[masked_tgt_offset])
                                original_offset += 1
                                masked_tgt_offset += 1
                        else:
                            assert original_tokens[
                                original_offset] == masked_src_token
                            original_offset += 1