Example #1
    def test_read(self):
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)
        reader = ReaderTest("", info)

        instructions = "test[:33%]"
        dset = Dataset(**reader.read(name, instructions, split_infos))
        self.assertEqual(dset["filename"][0], f"{name}-test")
        self.assertEqual(dset.num_rows, 33)
        self.assertEqual(dset.num_columns, 1)

        instructions = ["train", "test[:33%]"]
        datasets_kwargs = [
            reader.read(name, instr, split_infos) for instr in instructions
        ]
        train_dset, test_dset = [
            Dataset(**dataset_kwargs) for dataset_kwargs in datasets_kwargs
        ]
        self.assertEqual(train_dset["filename"][0], f"{name}-train")
        self.assertEqual(train_dset.num_rows, 100)
        self.assertEqual(train_dset.num_columns, 1)
        self.assertEqual(test_dset["filename"][0], f"{name}-test")
        self.assertEqual(test_dset.num_rows, 33)
        self.assertEqual(test_dset.num_columns, 1)
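These tests rely on a `ReaderTest` helper that the snippet does not show. Below is a minimal sketch of what such a test double could look like, assuming the internal `BaseReader` interface from `datasets.arrow_reader` and the older hook name `_get_dataset_from_filename`; the hook name, its signature, and the 100-rows-per-file mock are assumptions inferred from the assertions above, not a documented API.

    import pyarrow as pa
    from datasets.arrow_reader import BaseReader

    class ReaderTest(BaseReader):
        """Test double that fabricates tables instead of reading Arrow files."""

        def _get_dataset_from_filename(self, filename_skip_take):
            # filename_skip_take is a dict like {"filename": ..., "skip": ..., "take": ...}
            filename = filename_skip_take["filename"]
            skip = filename_skip_take.get("skip")
            take = filename_skip_take.get("take")
            # Fabricate a one-column table with 100 rows per "file",
            # then apply the optional skip/take window.
            pa_table = pa.Table.from_pydict({"filename": [filename] * 100})
            if skip is not None and take is not None:
                pa_table = pa_table.slice(skip, take)
            return pa_table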
Example #2
    def finalize(self, timeout=100):
        """Close all the writing process and load/gather the data
        from all the nodes if main node or all_process is True.
        """
        if self.writer is not None:
            self.writer.finalize()
        self.writer = None
        if self.filelock is not None:
            self.filelock.release()

        if self.keep_in_memory:
            # Read the predictions and references from the in-memory buffer
            self.data = Dataset.from_buffer(self.buf_writer.getvalue())

        elif self.process_id == 0:
            # Let's acquire a lock on each node files to be sure they are finished writing
            file_paths, filelocks = self._get_all_cache_files(timeout=timeout)

            # Read the predictions and references
            try:
                reader = ArrowReader(path=self.data_dir, info=None)
                self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
            except FileNotFoundError:
                raise ValueError(
                    "Another metric instance is already using the local cache file. "
                    "Please specify an experiment_id to avoid colision between distributed metric instances."
                )

            # Store the file paths and locks; we will release/delete them after the computation.
            self.file_paths = file_paths
            self.filelocks = filelocks
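For context on the `elif` branch above: each writing process holds a lock on its own cache file, and process 0 acquires every lock before reading and deleting the files. The `datasets` library builds this on the `filelock` package; here is a minimal standalone sketch of the same acquire-with-timeout pattern (the lock path is hypothetical):

    from filelock import FileLock, Timeout

    lock = FileLock("/tmp/metric-cache.arrow.lock")  # hypothetical lock file
    try:
        # Block for up to 100 seconds waiting for the writer to finish.
        with lock.acquire(timeout=100):
            pass  # safe to read/delete the cache file while the lock is held
    except Timeout:
        raise ValueError("Another process is still writing the cache file.")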
Example #3
    def _create_dummy_dataset(self) -> Dataset:
        # Build a 30-row, one-column dataset directly from an in-memory pyarrow table.
        dset = Dataset(
            pa.Table.from_pydict({
                "filename": [
                    "my_name-train" + "_" + str(x)
                    for x in np.arange(30).tolist()
                ]
            }))
        return dset
Example #4
    def _create_dummy_dataset(self):
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=30)
        test_info = SplitInfo(name="test", num_examples=30)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)
        reader = ReaderTester("", info)
        dset = Dataset(**reader.read(name, "train", split_infos))
        return dset
Example #5
    def test_read_files(self):
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)
        reader = ReaderTest("", info)

        files = [
            {"filename": "train"},
            {"filename": "test", "skip": 10, "take": 10},
        ]
        dset = Dataset(**reader.read_files(files, original_instructions=""))
        self.assertEqual(dset.num_rows, 110)
        self.assertEqual(dset.num_columns, 1)
        self.assertEqual(dset._data_files, files)
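The expected 110 rows are the full 100-row train file plus a 10-row window of the test file: `skip: 10, take: 10` drops the first 10 rows and keeps the next 10. A skip/take pair like this corresponds to pyarrow's `Table.slice(offset, length)`; a standalone check of the arithmetic:

    import pyarrow as pa

    table = pa.Table.from_pydict({"filename": ["test"] * 100})
    sliced = table.slice(10, 10)  # skip the first 10 rows, take the next 10
    assert sliced.num_rows == 10  # 100 (train) + 10 (test window) = 110 rows total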