Example 1
    def test_from_arrow_schema_simple(self):
        data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
        original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
        dset = Dataset.from_dict(data, features=original_features)
        new_features = Features.from_arrow_schema(dset.schema)
        new_dset = Dataset.from_dict(data, features=new_features)
        self.assertDictEqual(dset[0], new_dset[0])
        self.assertDictEqual(dset[:], new_dset[:])
Example 2
    def test_keep_features_after_transform_from_file(self):
        features = Features(
            {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
        )
        dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

        def invert_labels(x):
            return {"labels": [(1 - label) for label in x["labels"]]}

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, "test.arrow")
            dset.map(invert_labels, cache_file_name=tmp_file)
            inverted_dset = Dataset.from_file(tmp_file)
            self.assertEqual(inverted_dset.features.type, features.type)
            self.assertDictEqual(inverted_dset.features, features)
Example 3
    def test_from_dict(self):
        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
        dset = Dataset.from_dict(data)
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

        features = Features({"col_1": Value("int64"), "col_2": Value("string")})
        dset = Dataset.from_dict(data, features=features)
        self.assertListEqual(dset["col_1"], data["col_1"])
        self.assertListEqual(dset["col_2"], data["col_2"])
        self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

        features = Features({"col_1": Value("string"), "col_2": Value("string")})
        self.assertRaises(pa.ArrowTypeError, Dataset.from_dict, data, features=features)
Example 4
    def test_flatten(self):
        dset = Dataset.from_dict(
            {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
            features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
        )
        dset.flatten()
        self.assertListEqual(dset.column_names, ["a.b.c", "foo"])
        self.assertListEqual(list(dset.features.keys()), ["a.b.c", "foo"])
        self.assertDictEqual(dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}))
Example 5
    def _create_dummy_dataset(self) -> Dataset:
        dset = Dataset(
            pa.Table.from_pydict({"filename": ["my_name-train_" + str(x) for x in np.arange(30).tolist()]})
        )
        return dset
Example 6
    def test_concatenate(self):
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        dset1, dset2, dset3 = Dataset.from_dict(data1), Dataset.from_dict(data2), Dataset.from_dict(data3)
        dset1._info = DatasetInfo(description="Dataset1")
        dset2._info = DatasetInfo(description="Dataset2")
        dset3._info = None

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")
Example 7
    def test_concatenate(self):
        data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
        info1 = DatasetInfo(description="Dataset1")
        info2 = DatasetInfo(description="Dataset2")
        dset1, dset2, dset3 = (
            Dataset.from_dict(data1, info=info1),
            Dataset.from_dict(data2, info=info2),
            Dataset.from_dict(data3),
        )

        dset_concat = concatenate_datasets([dset1, dset2, dset3])
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
        self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")
Example 8
    def test_keep_features_after_transform_in_memory(self):
        features = Features(
            {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
        )
        dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

        def invert_labels(x):
            return {"labels": [(1 - label) for label in x["labels"]]}

        inverted_dset = dset.map(invert_labels, keep_in_memory=True)
        self.assertEqual(inverted_dset.features.type, features.type)
        self.assertDictEqual(inverted_dset.features, features)
Example 9
    def _create_dummy_dataset(self):
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=30)
        test_info = SplitInfo(name="test", num_examples=30)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)
        reader = ReaderTester("", info)
        dset = Dataset(**reader.read(name, "train", split_infos))
        return dset
Example 10
    def test_read_files(self):
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)
        reader = ReaderTest("", info)

        files = [{"filename": "train"}, {"filename": "test", "skip": 10, "take": 10}]
        dset = Dataset(**reader.read_files(files, original_instructions=""))
        self.assertEqual(dset.num_rows, 110)
        self.assertEqual(dset.num_columns, 1)
        self.assertEqual(dset._data_files, files)
Example 11
    def _create_dummy_dataset(self):
        dset = Dataset.from_dict({"filename": ["my_name-train_" + str(x) for x in np.arange(30).tolist()]})
        return dset
Example 12
class Metric(object):
    def __init__(
        self,
        keep_in_memory: bool = False,
        data_dir: Optional[str] = None,
        num_process: int = 1,
        process_id: int = 0,
        seed: Optional[int] = None,
        config_name: Optional[str] = None,
        experiment_id: Optional[str] = None,
        max_concurrent_cache_files: int = 10000,
        **kwargs,
    ):
        """A Metrics is the base class and common API for all metrics.
        Args:
            keep_in_memory (``bool``): keep all predictions and references in memory. Not possible in distributed settings.
            data_dir (``str``): Path to a directory in which temporary prediction/references data will be stored.
                The data directory should be located on a shared file-system in distributed setups.
            num_process (``int``): specify the total number of nodes in a distributed setting.
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            process_id (``int``): specify the id of the current process in a distributed setup (between 0 and num_process-1).
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            seed (Optional ``int``): If specified, this will temporarily set numpy's random seed when :func:`nlp.Metric.compute` is run.
            config_name (``str``): This is used to define a hash specific to a metric computation script and prevents the metric's data
                from being overridden when the metric loading script is modified.
            experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            max_concurrent_cache_files (``int``): Max number of concurrent metrics cache files (default 10000).
        """
        # Metric name
        self.name = camelcase_to_snakecase(self.__class__.__name__)
        # Configuration name
        self.config_name: str = config_name or "default_config"
        # Experiment id
        self.experiment_id: str = experiment_id or "default_experiment"

        # Safety checks on num_process and process_id
        assert isinstance(process_id, int) and process_id >= 0, "'process_id' should be a number greater than or equal to 0"
        assert isinstance(num_process, int) and num_process > process_id, "'num_process' should be a number greater than process_id"
        assert (
            num_process == 1 or not keep_in_memory
        ), "Using 'keep_in_memory' is not possible in distributed setting (num_process > 1)."
        self.num_process = num_process
        self.process_id = process_id
        self.max_concurrent_cache_files = max_concurrent_cache_files

        self.keep_in_memory = keep_in_memory
        self._data_dir_root = os.path.expanduser(data_dir or HF_METRICS_CACHE)
        self.data_dir = self._build_data_dir()
        self.seed: int = seed or np.random.get_state()[1][0]

        # prepare info
        info = self._info()
        info.metric_name = self.name
        info.config_name = self.config_name
        info.experiment_id = self.experiment_id
        self.info = info

        # Update 'compute' and 'add' docstring
        # methods need to be copied otherwise it changes the docstrings of every instance
        self.compute = types.MethodType(copyfunc(self.compute), self)
        self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
        self.add = types.MethodType(copyfunc(self.add), self)
        self.compute.__func__.__doc__ += self.info.inputs_description
        self.add_batch.__func__.__doc__ += self.info.inputs_description
        self.add.__func__.__doc__ += self.info.inputs_description

        # self.arrow_schema = pa.schema(field for field in self.info.features.type)
        self.buf_writer = None
        self.writer = None
        self.writer_batch_size = None
        self.data = None

        # This is the cache file we store our predictions/references in
        # Keep it None for now so we can (cloud)pickle the object
        self.cache_file_name = None
        self.filelock = None

        # This is all the cache files on which we have a lock when we are in a distributed setting
        self.file_paths = None
        self.filelocks = None

    def _build_data_dir(self):
        """Path of this metric in cache_dir:
        Will be:
            self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
        If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        """
        builder_data_dir = self._data_dir_root
        builder_data_dir = os.path.join(builder_data_dir, self.name,
                                        self.config_name)
        os.makedirs(builder_data_dir, exist_ok=True)
        return builder_data_dir

    def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
        """ Create a new cache file. If the default cache file is used, we generated a new hash. """
        file_path = os.path.join(
            self.data_dir,
            f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")

        for i in range(self.max_concurrent_cache_files):
            filelock = FileLock(file_path + ".lock")
            try:
                filelock.acquire(timeout=timeout)
            except Timeout:
                # If we have reached the max number of attempts, or we are not allowed
                # to pick a new file name (distributed setup), we raise an error
                if self.num_process != 1:
                    raise ValueError(
                        f"Another metric instance is already using the local cache file at {file_path}. "
                        f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
                        f"between distributed metric instances.")
                if i == self.max_concurrent_cache_files - 1:
                    raise ValueError(
                        f"Cannot acquire lock: too many metric instances are operating concurrently on this file system. "
                        f"You should set a larger value of max_concurrent_cache_files when creating the metric "
                        f"(current value is {self.max_concurrent_cache_files})."
                    )
                # Otherwise (we may pick a new file name and have not reached the max number of attempts), sample a new file name.
                file_uuid = str(uuid.uuid4())
                file_path = os.path.join(
                    self.data_dir,
                    f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
                )
            else:
                break

        return file_path, filelock

    def _get_all_cache_files(self, timeout=100) -> Tuple[List[str], List[FileLock]]:
        """Get a lock on all the cache files in a distributed setup.
        We wait up to `timeout` seconds to let all the distributed nodes finish writing (default is 100 seconds).
        """
        if self.num_process == 1:
            file_paths = [self.cache_file_name]
        else:
            file_paths = [
                os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow")
                for process_id in range(self.num_process)
            ]

        # Let's acquire a lock on each process files to be sure they are finished writing
        filelocks = []
        for process_id, file_path in enumerate(file_paths):
            filelock = FileLock(file_path + ".lock")
            try:
                filelock.acquire(timeout=timeout)
            except Timeout:
                raise ValueError(
                    f"Cannot acquire lock on cached file {file_path} for process {process_id}."
                )
            else:
                filelocks.append(filelock)

        return file_paths, filelocks

    def finalize(self, timeout=100):
        """Close all the writing process and load/gather the data
        from all the nodes if main node or all_process is True.
        """
        if self.writer is not None:
            self.writer.finalize()
        self.writer = None
        if self.filelock is not None:
            self.filelock.release()

        if self.keep_in_memory:
            # Read the predictions and references
            reader = ArrowReader(path=self.data_dir, info=None)
            self.data = Dataset.from_buffer(self.buf_writer.getvalue())

        elif self.process_id == 0:
            # Let's acquire a lock on each node files to be sure they are finished writing
            file_paths, filelocks = self._get_all_cache_files(timeout=timeout)

            # Read the predictions and references
            try:
                reader = ArrowReader(path=self.data_dir, info=None)
                self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
            except FileNotFoundError:
                raise ValueError(
                    "Another metric instance is already using the local cache file. "
                    "Please specify an experiment_id to avoid collision between distributed metric instances."
                )

            # Store file paths and locks and we will release/delete them after the computation.
            self.file_paths = file_paths
            self.filelocks = filelocks

    def compute(self, *args, **kwargs) -> Optional[dict]:
        """Compute the metrics.

        Args:
            We disallow the usage of positional arguments to prevent mistakes
            `predictions` (Optional list/array/tensor): predictions
            `references` (Optional list/array/tensor): references
            `timeout` (Optional int): timeout for distributed gathering of values on several nodes
            `**kwargs` (Optional other kwargs): will be forwared to the metrics :func:`_compute` method (see details in the docstring)

        Return:
            Dictionnary with the metrics if this metric is run on the main process (process_id == 0)
            None if the metric is not run on the main process (process_id != 0)
        """
        if args:
            raise ValueError("Please call `compute` using keyword arguments.")

        predictions = kwargs.pop("predictions", None)
        references = kwargs.pop("references", None)
        timeout = kwargs.pop("timeout", 120)

        if predictions is not None:
            self.add_batch(predictions=predictions, references=references)
        self.finalize(timeout=timeout)

        self.cache_file_name = None
        self.filelock = None

        if self.process_id == 0:
            self.data.set_format(type=self.info.format)

            predictions = self.data["predictions"]
            references = self.data["references"]
            with temp_seed(self.seed):
                output = self._compute(predictions=predictions,
                                       references=references,
                                       **kwargs)

            if self.buf_writer is not None:
                self.buf_writer = None
            else:
                # Release locks and delete all the cache files
                for filelock, file_path in zip(self.filelocks,
                                               self.file_paths):
                    logger.info(f"Removing {file_path}")
                    os.remove(file_path)
                    filelock.release()

            return output
        else:
            return None

    def add_batch(self, *, predictions=None, references=None):
        """
        Add a batch of predictions and references for the metric's stack.
        """
        batch = {"predictions": predictions, "references": references}
        batch = self.info.features.encode_batch(batch)
        if self.writer is None:
            self._init_writer()
        self.writer.write_batch(batch)

    def add(self, *, prediction=None, reference=None):
        """Add one prediction and reference for the metric's stack."""
        example = {"predictions": prediction, "references": reference}
        example = self.info.features.encode_example(example)
        if self.writer is None:
            self._init_writer()
        self.writer.write(example)

    def _init_writer(self):
        if self.keep_in_memory:
            self.buf_writer = pa.BufferOutputStream()
            self.writer = ArrowWriter(features=self.info.features,
                                      stream=self.buf_writer,
                                      writer_batch_size=self.writer_batch_size)
        else:
            self.buf_writer = None

            # Get cache file name and lock it
            if self.cache_file_name is None or self.filelock is None:
                cache_file_name, filelock = self._create_cache_file()
                self.cache_file_name = cache_file_name
                self.filelock = filelock

            self.writer = ArrowWriter(features=self.info.features,
                                      path=self.cache_file_name,
                                      writer_batch_size=self.writer_batch_size)

    def _info(self) -> MetricInfo:
        """Construct the MetricInfo object. See `MetricInfo` for details.

        Warning: This function is only called once and the result is cached
        for all subsequent accesses of ``self.info``.

        Returns:
            info: (MetricInfo) The metrics information
        """
        raise NotImplementedError

    def download_and_prepare(
        self,
        download_config: Optional[DownloadConfig] = None,
        dl_manager: Optional[DownloadManager] = None,
        **download_and_prepare_kwargs,
    ):
        """Downloads and prepares dataset for reading.

        Args:
            download_config (Optional ``nlp.DownloadConfig``): specific download configuration parameters.
            dl_manager (Optional ``nlp.DownloadManager``): specific DownloadManager to use.
        """
        if dl_manager is None:
            if download_config is None:
                download_config = DownloadConfig()
                download_config.cache_dir = os.path.join(
                    self.data_dir, "downloads")
                download_config.force_download = False

            dl_manager = DownloadManager(dataset_name=self.name,
                                         download_config=download_config,
                                         data_dir=self.data_dir)

        self._download_and_prepare(dl_manager)

    def _download_and_prepare(self, dl_manager):
        """Downloads and prepares resources for the metric.

        This is the internal implementation to overwrite called when user calls
        `download_and_prepare`. It should download all required resources for the metric.

        Args:
            dl_manager: (DownloadManager) `DownloadManager` used to download and cache
                data.
        """
        return None

    def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]:
        """This method defines the common API for all the metrics in the library."""
        raise NotImplementedError
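
To make the lifecycle above concrete, here is a minimal sketch of a metric built on this base class. It is not part of the source above: the top-level `nlp` imports (`Metric`, `MetricInfo`, `Features`, `Value`) and the `ExactMatch` logic are illustrative assumptions, exercising only the `_info`/`_compute` hooks and the `add_batch`/`compute` API defined here.

# Hypothetical usage sketch (not from the source above): assumes the `nlp`
# package exposes Metric, MetricInfo, Features, and Value at the top level.
import nlp


class ExactMatch(nlp.Metric):
    def _info(self):
        # description, citation, inputs_description and features are the
        # MetricInfo fields that Metric.__init__ and add_batch rely on above.
        return nlp.MetricInfo(
            description="Fraction of predictions equal to their reference.",
            citation="",
            inputs_description="predictions: list of int, references: list of int",
            features=nlp.Features({"predictions": nlp.Value("int64"), "references": nlp.Value("int64")}),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        correct = sum(int(p == r) for p, r in zip(predictions, references))
        return {"exact_match": correct / len(references)}


metric = ExactMatch()
metric.add_batch(predictions=[0, 1, 1], references=[0, 1, 0])  # buffered through an ArrowWriter
result = metric.compute()  # keyword-only API; returns the dict on process 0, None elsewhere

In a distributed run, each node would construct the metric with the same experiment_id, its own process_id, and the shared num_process; per the finalize() logic above, only process 0 gathers the cache files and returns the scores.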