def test_from_arrow_schema_simple(self):
    data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    new_features = Features.from_arrow_schema(dset.schema)
    new_dset = Dataset.from_dict(data, features=new_features)
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])
def test_keep_features_after_transform_from_file(self):
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        return {"labels": [(1 - label) for label in x["labels"]]}

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "test.arrow")
        dset.map(invert_labels, cache_file_name=tmp_file)
        inverted_dset = Dataset.from_file(tmp_file)
        self.assertEqual(inverted_dset.features.type, features.type)
        self.assertDictEqual(inverted_dset.features, features)
def test_from_dict(self):
    data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
    dset = Dataset.from_dict(data)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    dset = Dataset.from_dict(data, features=features)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])

    features = Features({"col_1": Value("string"), "col_2": Value("string")})
    self.assertRaises(pa.ArrowTypeError, Dataset.from_dict, data, features=features)
def test_flatten(self):
    dset = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset.flatten()
    self.assertListEqual(dset.column_names, ["a.b.c", "foo"])
    self.assertListEqual(list(dset.features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}))
def _create_dummy_dataset(self) -> Dataset:
    dset = Dataset(
        pa.Table.from_pydict({"filename": ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]})
    )
    return dset
def test_concatenate(self):
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    dset1, dset2, dset3 = Dataset.from_dict(data1), Dataset.from_dict(data2), Dataset.from_dict(data3)
    dset1._info = DatasetInfo(description="Dataset1")
    dset2._info = DatasetInfo(description="Dataset2")
    dset3._info = None

    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")
def test_concatenate(self):
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    info1 = DatasetInfo(description="Dataset1")
    info2 = DatasetInfo(description="Dataset2")
    dset1, dset2, dset3 = (
        Dataset.from_dict(data1, info=info1),
        Dataset.from_dict(data2, info=info2),
        Dataset.from_dict(data3),
    )

    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")
def test_keep_features_after_transform_in_memory(self):
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)

    def invert_labels(x):
        return {"labels": [(1 - label) for label in x["labels"]]}

    inverted_dset = dset.map(invert_labels, keep_in_memory=True)
    self.assertEqual(inverted_dset.features.type, features.type)
    self.assertDictEqual(inverted_dset.features, features)
def _create_dummy_dataset(self):
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=30)
    test_info = SplitInfo(name="test", num_examples=30)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    split_dict.add(train_info)
    split_dict.add(test_info)
    info = DatasetInfo(splits=split_dict)
    reader = ReaderTest("", info)
    dset = Dataset(**reader.read(name, "train", split_infos))
    return dset
def test_read_files(self):
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_dict = SplitDict()
    split_dict.add(train_info)
    split_dict.add(test_info)
    info = DatasetInfo(splits=split_dict)
    reader = ReaderTest("", info)
    files = [{"filename": "train"}, {"filename": "test", "skip": 10, "take": 10}]
    dset = Dataset(**reader.read_files(files, original_instructions=""))
    # 100 rows from "train" + 10 rows from "test" (skip 10, take 10) = 110 rows
    self.assertEqual(dset.num_rows, 110)
    self.assertEqual(dset.num_columns, 1)
    self.assertEqual(dset._data_files, files)
def _create_dummy_dataset(self):
    dset = Dataset.from_dict({"filename": ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]})
    return dset
class Metric(object):
    def __init__(
        self,
        keep_in_memory: bool = False,
        data_dir: Optional[str] = None,
        num_process: int = 1,
        process_id: int = 0,
        seed: Optional[int] = None,
        config_name: Optional[str] = None,
        experiment_id: Optional[str] = None,
        max_concurrent_cache_files: int = 10000,
        **kwargs,
    ):
        """A Metric is the base class and common API for all metrics.

        Args:
            keep_in_memory (``bool``): Keep all predictions and references in memory. Not possible in distributed settings.
            data_dir (``str``): Path to a directory in which temporary prediction/references data will be stored.
                The data directory should be located on a shared file-system in distributed setups.
            num_process (``int``): Total number of nodes in a distributed setting.
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            process_id (``int``): Id of the current process in a distributed setup (between 0 and num_process-1).
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            seed (Optional ``int``): If specified, this will temporarily set numpy's random seed when :func:`nlp.Metric.compute` is run.
            config_name (``str``): This is used to define a hash specific to a metric computation script and prevents the metric's data
                from being overridden when the metric loading script is modified.
            experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
                This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
            max_concurrent_cache_files (``int``): Max number of concurrent metric cache files (default 10000).
        """
        # Metric name
        self.name = camelcase_to_snakecase(self.__class__.__name__)
        # Configuration name
        self.config_name: str = config_name or "default_config"
        # Experiment id
        self.experiment_id: str = experiment_id or "default_experiment"

        # Safety checks on num_process and process_id
        assert (
            isinstance(process_id, int) and process_id >= 0
        ), "'process_id' should be a number greater than or equal to 0"
        assert (
            isinstance(num_process, int) and num_process > process_id
        ), "'num_process' should be a number greater than process_id"
        assert (
            num_process == 1 or not keep_in_memory
        ), "Using 'keep_in_memory' is not possible in a distributed setting (num_process > 1)."
        self.num_process = num_process
        self.process_id = process_id
        self.max_concurrent_cache_files = max_concurrent_cache_files

        self.keep_in_memory = keep_in_memory
        self._data_dir_root = os.path.expanduser(data_dir or HF_METRICS_CACHE)
        self.data_dir = self._build_data_dir()
        self.seed: int = seed or np.random.get_state()[1][0]

        # prepare info
        info = self._info()
        info.metric_name = self.name
        info.config_name = self.config_name
        info.experiment_id = self.experiment_id
        self.info = info

        # Update 'compute' and 'add' docstrings
        # methods need to be copied otherwise it changes the docstrings of every instance
        self.compute = types.MethodType(copyfunc(self.compute), self)
        self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
        self.add = types.MethodType(copyfunc(self.add), self)
        self.compute.__func__.__doc__ += self.info.inputs_description
        self.add_batch.__func__.__doc__ += self.info.inputs_description
        self.add.__func__.__doc__ += self.info.inputs_description

        # self.arrow_schema = pa.schema(field for field in self.info.features.type)
        self.buf_writer = None
        self.writer = None
        self.writer_batch_size = None
        self.data = None

        # This is the cache file we store our predictions/references in
        # Keep it None for now so we can (cloud)pickle the object
        self.cache_file_name = None
        self.filelock = None

        # These are all the cache files on which we have a lock when we are in a distributed setting
        self.file_paths = None
        self.filelocks = None

    def _build_data_dir(self):
        """Path of this metric in cache_dir:
        Will be:
            self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
        If any of these elements is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        """
        builder_data_dir = self._data_dir_root
        builder_data_dir = os.path.join(builder_data_dir, self.name, self.config_name)
        os.makedirs(builder_data_dir, exist_ok=True)
        return builder_data_dir

    def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
        """Create a new cache file. If the default cache file is used, we generate a new hash."""
        file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")
        for i in range(self.max_concurrent_cache_files):
            filelock = FileLock(file_path + ".lock")
            try:
                filelock.acquire(timeout=timeout)
            except Timeout:
                # If we have reached the max number of attempts or we are not allowed to find a free name
                # (distributed setup), we raise an error
                if self.num_process != 1:
                    raise ValueError(
                        f"Another metric instance is already using the local cache file at {file_path}. "
                        f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
                        f"between distributed metric instances."
                    )
                if i == self.max_concurrent_cache_files - 1:
                    raise ValueError(
                        f"Cannot acquire lock, too many metric instances are operating concurrently on this file system. "
                        f"You should set a larger value of max_concurrent_cache_files when creating the metric "
                        f"(current value is {self.max_concurrent_cache_files})."
                    )
                # In other cases (allowed to find a new file name + not yet at max num of attempts) we can try to sample a new hashing name.
                file_uuid = str(uuid.uuid4())
                file_path = os.path.join(
                    self.data_dir, f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
                )
            else:
                break

        return file_path, filelock

    def _get_all_cache_files(self, timeout=100) -> Tuple[List[str], List[FileLock]]:
        """Get a lock on all the cache files in a distributed setup.
        We wait ``timeout`` seconds to let all the distributed nodes finish their tasks (default is 100 seconds).
""" if self.num_process == 1: file_paths = [self.cache_file_name] else: file_paths = [ os.path.join( self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow" ) for process_id in range(self.num_process) ] # Let's acquire a lock on each process files to be sure they are finished writing filelocks = [] for process_id, file_path in enumerate(file_paths): filelock = FileLock(file_path + ".lock") try: filelock.acquire(timeout=timeout) except Timeout: raise ValueError( f"Cannot acquire lock on cached file {file_path} for process {process_id}." ) else: filelocks.append(filelock) return file_paths, filelocks def finalize(self, timeout=100): """Close all the writing process and load/gather the data from all the nodes if main node or all_process is True. """ if self.writer is not None: self.writer.finalize() self.writer = None if self.filelock is not None: self.filelock.release() if self.keep_in_memory: # Read the predictions and references reader = ArrowReader(path=self.data_dir, info=None) self.data = Dataset.from_buffer(self.buf_writer.getvalue()) elif self.process_id == 0: # Let's acquire a lock on each node files to be sure they are finished writing file_paths, filelocks = self._get_all_cache_files(timeout=timeout) # Read the predictions and references try: reader = ArrowReader(path=self.data_dir, info=None) self.data = Dataset(**reader.read_files([{ "filename": f } for f in file_paths])) except FileNotFoundError: raise ValueError( "Another metric instance is already using the local cache file. " "Please specify an experiment_id to avoid colision between distributed metric instances." ) # Store file paths and locks and we will release/delete them after the computation. self.file_paths = file_paths self.filelocks = filelocks def compute(self, *args, **kwargs) -> Optional[dict]: """Compute the metrics. Args: We disallow the usage of positional arguments to prevent mistakes `predictions` (Optional list/array/tensor): predictions `references` (Optional list/array/tensor): references `timeout` (Optional int): timeout for distributed gathering of values on several nodes `**kwargs` (Optional other kwargs): will be forwared to the metrics :func:`_compute` method (see details in the docstring) Return: Dictionnary with the metrics if this metric is run on the main process (process_id == 0) None if the metric is not run on the main process (process_id != 0) """ if args: raise ValueError("Please call `compute` using keyword arguments.") predictions = kwargs.pop("predictions", None) references = kwargs.pop("references", None) timeout = kwargs.pop("timeout", 120) if predictions is not None: self.add_batch(predictions=predictions, references=references) self.finalize(timeout=timeout) self.cache_file_name = None self.filelock = None if self.process_id == 0: self.data.set_format(type=self.info.format) predictions = self.data["predictions"] references = self.data["references"] with temp_seed(self.seed): output = self._compute(predictions=predictions, references=references, **kwargs) if self.buf_writer is not None: self.buf_writer = None else: # Release locks and delete all the cache files for filelock, file_path in zip(self.filelocks, self.file_paths): logger.info(f"Removing {file_path}") os.remove(file_path) filelock.release() return output else: return None def add_batch(self, *, predictions=None, references=None): """ Add a batch of predictions and references for the metric's stack. 
""" batch = {"predictions": predictions, "references": references} batch = self.info.features.encode_batch(batch) if self.writer is None: self._init_writer() self.writer.write_batch(batch) def add(self, *, prediction=None, reference=None): """Add one prediction and reference for the metric's stack.""" example = {"predictions": prediction, "references": reference} example = self.info.features.encode_example(example) if self.writer is None: self._init_writer() self.writer.write(example) def _init_writer(self): if self.keep_in_memory: self.buf_writer = pa.BufferOutputStream() self.writer = ArrowWriter(features=self.info.features, stream=self.buf_writer, writer_batch_size=self.writer_batch_size) else: self.buf_writer = None # Get cache file name and lock it if self.cache_file_name is None or self.filelock is None: cache_file_name, filelock = self._create_cache_file() self.cache_file_name = cache_file_name self.filelock = filelock self.writer = ArrowWriter(features=self.info.features, path=self.cache_file_name, writer_batch_size=self.writer_batch_size) def _info(self) -> MetricInfo: """Construct the MetricInfo object. See `MetricInfo` for details. Warning: This function is only called once and the result is cached for all following .info() calls. Returns: info: (MetricInfo) The metrics information """ raise NotImplementedError def download_and_prepare( self, download_config: Optional[DownloadConfig] = None, dl_manager: Optional[DownloadManager] = None, **download_and_prepare_kwargs, ): """Downloads and prepares dataset for reading. Args: download_config (Optional ``nlp.DownloadConfig``: specific download configuration parameters. dl_manager (Optional ``nlp.DownloadManager``): specific Download Manger to use """ if dl_manager is None: if download_config is None: download_config = DownloadConfig() download_config.cache_dir = os.path.join( self.data_dir, "downloads") download_config.force_download = False dl_manager = DownloadManager(dataset_name=self.name, download_config=download_config, data_dir=self.data_dir) self._download_and_prepare(dl_manager) def _download_and_prepare(self, dl_manager): """Downloads and prepares resources for the metric. This is the internal implementation to overwrite called when user calls `download_and_prepare`. It should download all required resources for the metric. Args: dl_manager: (DownloadManager) `DownloadManager` used to download and cache data.. """ return None def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]: """ This method defines the common API for all the metrics in the library """ raise NotImplementedError