def test_read_parquet(self):
    """Read ``test.parquet`` with an explicit parquet format and check its columns.

    The computed dataframe must expose both the ``reviewerID`` and the ``path`` columns.
    """
    file_path = os.path.join(FILES_PATH, "test.parquet")
    ds = DataSource(format="parquet", source=file_path)

    df = ds.to_dataframe().compute()

    # assertIn reports the missing member and the container on failure,
    # which assertTrue("x" in ...) cannot do.
    self.assertIn("reviewerID", df.columns)
    self.assertIn("path", df.columns)
def test_no_mapping(self):
    """Asking for a mapped dataframe on a source built without a mapping must raise ``ValueError``."""
    jsonl_path = os.path.join(FILES_PATH, "dataset_source.jsonl")
    unmapped_source = DataSource(format="json", source=jsonl_path)

    with pytest.raises(ValueError):
        unmapped_source.to_mapped_dataframe()
def test_flatten_json(self):
    """Load ``to-be-flattened.jsonl`` with ``flatten=True`` and check the dotted column names."""
    source = os.path.join(FILES_PATH, "to-be-flattened.jsonl")
    flattened = DataSource(format="json", flatten=True, source=source).to_dataframe().compute()

    expected_columns = ("persons.*.lastName", "persons.*.name")
    for c in expected_columns:
        self.assertIn(c, flattened.columns, f"Expected {c} as column name")
def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
    """Read ``trailing_coma_in_headers.csv`` with ``sep=";"`` and check a ``name`` column exists."""
    csv_path = os.path.join(FILES_PATH, "trailing_coma_in_headers.csv")
    reader = DataSource(format="csv", source=csv_path, sep=";")

    columns = reader.to_dataframe().compute().columns

    self.assertIn("name", columns)
def test_read_csv(self):
    """Read ``dataset_source.csv`` and check it is non-empty and has a ``path`` column."""
    file_path = os.path.join(FILES_PATH, "dataset_source.csv")
    datasource = DataSource(format="csv", source=file_path)

    data_frame = datasource.to_dataframe().compute()

    # unittest assertions are never stripped by ``python -O`` (a bare
    # ``assert`` is) and they produce descriptive failure messages.
    self.assertGreater(len(data_frame), 0)
    self.assertIn("path", data_frame.columns)
def test_flatten_nested_list(self):
    """Load ``nested-list.jsonl`` with ``flatten=True`` and check the nested-list column names."""
    source = os.path.join(FILES_PATH, "nested-list.jsonl")
    flattened = DataSource(format="json", flatten=True, source=source).to_dataframe().compute()

    expected_columns = (
        "classification.*.origin.*.key",
        "classification.*.origin.*.source",
    )
    for c in expected_columns:
        self.assertIn(c, flattened.columns, f"Expected {c} as data column")
def test_add_mock_format(self):
    """Register a custom ``new-format`` parser and check a source using it yields columns."""

    def ds_parser(*args, **kwargs):
        """Ignore all arguments and return a single-partition dask frame holding 0..99."""
        from dask import dataframe as ddf
        import pandas as pd

        return ddf.from_pandas(pd.DataFrame(list(range(100))), npartitions=1)

    DataSource.add_supported_format("new-format", ds_parser)

    frame = DataSource(source="source", format="new-format").to_dataframe()
    self.assertIsNotNone(frame.columns)
def training_data_source(tmp_path) -> DataSource:
    """Write three text/labels examples to ``train.json`` (JSON lines) and wrap the file in a ``DataSource``.

    The third example carries an empty label list.
    """
    texts = [
        "This is a simple NER test",
        "This is a simple NER test with misaligned spans",
        "No NER here",
    ]
    labels = [
        [{"start": 17, "end": 20, "label": "NER"}],
        [{"start": 17, "end": 22, "label": "NER"}],
        [],
    ]

    data_file = tmp_path / "train.json"
    frame = pd.DataFrame({"text": texts, "labels": labels})
    frame.to_json(data_file, lines=True, orient="records")

    return DataSource(
        source=str(data_file),
        flatten=False,
        lines=True,
        orient="records",
    )
def test_load_multiple_formats(self):
    """Building a source from a jsonl file and a csv file together must raise ``TypeError``."""
    jsonl_file = os.path.join(FILES_PATH, "dataset_source.jsonl")
    csv_file = os.path.join(FILES_PATH, "dataset_source.csv")
    files = [jsonl_file, csv_file]

    with pytest.raises(TypeError):
        DataSource(source=files)
def test_load_pipeline_with_custom_head():
    """Train a pipeline using ``MyCustomHead`` and check the head type survives a reload.

    The pipeline is built from a configuration, trained on ``dataset_source.csv``
    and reloaded from the produced ``model.tar.gz``; both the original and the
    reloaded pipeline must expose a ``MyCustomHead`` instance.
    """
    from tempfile import TemporaryDirectory

    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES, "resources/data/dataset_source.csv"),
        mapping={"label": "job", "text": ["education", "marital"]},
    )
    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))

    # The training output is only needed while this test runs; the context
    # manager removes it afterwards instead of leaking a mkdtemp() directory.
    with TemporaryDirectory() as output:
        pipeline.train(output=output, training=train)

        trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
        trained_pl.predict("Oh yeah")
        assert isinstance(trained_pl.head, MyCustomHead)
def explore(
    self,
    data_source: DataSource,
    explore_id: Optional[str] = None,
    es_host: Optional[str] = None,
    batch_size: int = 50,
    prediction_cache_size: int = 0,
    explain: bool = False,
    force_delete: bool = True,
    **metadata,
) -> dd.DataFrame:
    """Launches the Explore UI for a given data source

    Running this method inside an `IPython` notebook will try to render the UI directly in the notebook.

    Running this outside a notebook will try to launch the standalone web application.

    Note that, if `data_source` has no mapping, the default mapping of the pipeline model
    is assigned to it in place.

    Parameters
    ----------
    data_source: `DataSource`
        The data source to explore
    explore_id: `Optional[str]`
        A name or id for this explore run, useful for running and keeping track of several explorations
        (default is a freshly generated `uuid1` string)
    es_host: `Optional[str]`
        The URL to the Elasticsearch host for indexing predictions (default is `constants.DEFAULT_ES_HOST`)
    batch_size: `int`
        The batch size for indexing predictions (default is `50`)
    prediction_cache_size: `int`
        The size of the cache for caching predictions (default is `0`)
    explain: `bool`
        Whether to extract and return explanations of token importance (default is `False`)
    force_delete: `bool`
        Deletes exploration with the same `explore_id` before indexing the new explore items (default is `True`)
    **metadata
        Extra keyword arguments, forwarded to the `ExploreConfiguration`

    Returns
    -------
    explore_df: `dd.DataFrame`
        The dask DataFrame returned by the exploration run
    """
    from ._helpers import _explore, _show_explore

    config = ExploreConfiguration(
        batch_size=batch_size,
        prediction_cache_size=prediction_cache_size,
        explain=explain,
        force_delete=force_delete,
        **metadata,
    )

    es_config = ElasticsearchExplore(
        es_index=explore_id or str(uuid.uuid1()),
        es_host=es_host or constants.DEFAULT_ES_HOST,
    )

    # Fall back to the model's default mapping when the data source brings none
    if not data_source.mapping:
        data_source.mapping = self._model._default_ds_mapping

    explore_df = _explore(self, data_source, config, es_config)
    _show_explore(es_config)

    return explore_df
def train_data_source() -> DataSource:
    """Build the training ``DataSource`` for ``resources/data/emotions_with_transformers.txt``.

    The source is declared with ``format="csv"``, ``sep=";"`` and the column
    names ``text`` and ``label``.
    """
    data_dir = Path(__file__).parent.parent.joinpath("resources", "data")
    return DataSource(
        source=str(data_dir / "emotions_with_transformers.txt"),
        format="csv",
        sep=";",
        names=["text", "label"],
    )
def test_to_mapped(self):
    """The mapped dataframe must expose the mapping keys as columns, with or without an explicit format."""
    the_mapping = {"label": "overall", "tokens": "summary"}

    with_format = DataSource(
        format="json",
        mapping=the_mapping,
        source=os.path.join(FILES_PATH, "dataset_source.jsonl"),
    )
    without_format = DataSource(
        source=os.path.join(FILES_PATH, "dataset_source.jsonl"),
        mapping=the_mapping,
    )

    for ds in (with_format, without_format):
        mapped = ds.to_mapped_dataframe()
        for expected_column in ("label", "tokens"):
            self.assertIn(expected_column, mapped.columns)
def create_dataset(self, datasource: DataSource, lazy: bool = False) -> InstancesDataset:
    """
    Creates an instances torch Dataset from a data source

    Every pipeline input and the pipeline output are mapped to a column of the same name,
    unless `datasource.mapping` states otherwise. Note that the completed mapping is
    written back to `datasource.mapping`.

    Parameters
    ----------
    datasource:
        The source of data. A missing (`None` or empty) mapping is treated as "no overrides".
    lazy:
        If enabled, an `AllennlpLazyDataset` is returned, which yields the instances on iteration.
        Otherwise, all instances are computed and wrapped in an `AllennlpDataset`.

    Returns
    -------
    A torch Dataset containing the instances collection
    """
    # Identity mapping as the baseline; explicit data source mappings take precedence
    mapping = {k: k for k in self.inputs + [self.output] if k}
    # `or {}` avoids a TypeError in dict.update when the data source has no mapping at all
    mapping.update(datasource.mapping or {})
    datasource.mapping = mapping

    ddf = datasource.to_mapped_dataframe()
    # One featurize call per row, with the row's values passed as keyword arguments
    instances_series: "dask.dataframe.core.Series" = ddf.map_partitions(
        lambda df: df.apply(
            lambda row: self.head.featurize(**row.to_dict()), axis=1),
        meta=object,
    ).persist()
    # We remove the not featurizable examples from the data set. The head should log a warning for them though!
    instances_series = instances_series.dropna()

    if not lazy:
        return AllennlpDataset(list(instances_series.compute()))

    def instance_generator(path: str) -> Iterable[Instance]:
        """Yields the featurized instances; `path` is unused (the dataset gets a dummy file path)"""
        yield from instances_series

    return AllennlpLazyDataset(
        instance_generator=instance_generator,
        file_path="dummy",
    )
def test_lazy_dataset_creation(pipeline_test: Pipeline, datasource_test: DataSource):
    """A lazily created dataset must yield one text/label ``Instance`` per row of the data source."""
    source_frame = datasource_test.to_dataframe()

    lazy_dataset = pipeline_test.create_dataset(datasource_test, lazy=True)
    assert isinstance(lazy_dataset, AllennlpLazyDataset)

    # First pass over the dataset: only count what it yields
    yielded = sum(1 for _ in lazy_dataset)
    assert yielded == len(source_frame.text)

    # Second pass: inspect every yielded instance
    for instance in lazy_dataset:
        assert isinstance(instance, Instance)
        for field_name in ("text", "label"):
            assert field_name in instance.fields
def datasource_test(tmp_path) -> DataSource:
    """Write a three-row text/label frame to ``classifier.parquet`` and return a ``DataSource`` for it."""
    rows = {
        "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
        "label": ["one", "zero", "zero"],
    }

    data_file = tmp_path / "classifier.parquet"
    pd.DataFrame(rows).to_parquet(data_file)

    return DataSource(source=str(data_file))
def test_dataset_creation_with_partial_mapping(
    datasource_with_partial_mapping: DataSource, pipeline_test: Pipeline
):
    """A data source that maps only ``text`` must still produce a full text/label dataset."""
    mapped_frame = datasource_with_partial_mapping.to_mapped_dataframe()

    dataset = pipeline_test.create_dataset(datasource_with_partial_mapping)
    assert isinstance(dataset, AllennlpDataset)
    assert len(dataset) == len(mapped_frame.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        for field_name in ("text", "label"):
            assert field_name in instance.fields
def training_data_source(tmp_path) -> DataSource:
    """Write two text/entities/label examples to ``relations.json`` (JSON lines) and wrap the file in a ``DataSource``."""
    audits_example = {
        "text": "The most common audits were about waste and recycling.",
        "entities": [
            {"start": 34, "end": 39, "label": "PN", "text": "waste"},
            {"start": 16, "end": 22, "label": "QTY", "text": "audits"},
        ],
        "label": "Message-Topic(e1,e2)",
    }
    chairs_example = {
        "text": "The company fabricates plastic chairs.",
        "entities": [
            {"start": 4, "end": 11, "label": "OBJECT", "text": "company"},
            {"start": 31, "end": 37, "label": "SUBJECT", "text": "chairs"},
        ],
        "label": "Product-Producer(e2,e1)",
    }

    data_file = tmp_path / "relations.json"
    frame = pd.DataFrame([audits_example, chairs_example])
    frame.to_json(data_file, lines=True, orient="records")

    return DataSource(
        source=str(data_file),
        flatten=False,
        lines=True,
        orient="records",
    )
def datasource_with_partial_mapping(tmp_path) -> DataSource:
    """Write an another_text/label frame to ``classifier.parquet`` and return a ``DataSource`` mapping ``text`` to ``another_text``."""
    rows = {
        "another_text": [
            "A common text",
            "This is why you get",
            "Seriosly?, I'm not sure",
        ],
        "label": ["one", "zero", "zero"],
    }

    data_file = tmp_path / "classifier.parquet"
    pd.DataFrame(rows).to_parquet(data_file)

    # Only "text" is mapped explicitly; "label" is left out of the mapping
    return DataSource(source=str(data_file), mapping={"text": "another_text"})
def training_data_source(tmp_path) -> DataSource:
    """Write four plain texts to ``record_pairs.json`` (JSON lines) and wrap the file in a ``DataSource``."""
    texts = [
        "this is a text",
        "my name is dani",
        "this is a table",
        "my name is paco",
    ]

    data_file = tmp_path / "record_pairs.json"
    pd.DataFrame({"text": texts}).to_json(data_file, lines=True, orient="records")

    return DataSource(
        source=str(data_file),
        flatten=False,
        lines=True,
        orient="records",
    )
def training_data_source(tmp_path) -> DataSource:
    """Write three labelled record pairs to ``record_pairs.json`` (JSON lines) and wrap the file in a ``DataSource``."""

    def person(first: str, last: str) -> dict:
        """Build one record using the attribute keys of the data set."""
        return {"@fist_name": first, "@last_name": last}

    frame = pd.DataFrame({
        "record1": [
            person("Hans", "Peter"),
            person("Heinrich", "Meier"),
            person("Hans", "Peter"),
        ],
        "record2": [
            person("Hans", "Petre"),
            person("Heinz", "Meier"),
            person("Hansel", "Peter"),
        ],
        "label": ["duplicate", "not_duplicate", "duplicate"],
    })

    data_file = tmp_path / "record_pairs.json"
    frame.to_json(data_file, lines=True, orient="records")

    return DataSource(
        source=str(data_file),
        flatten=False,
        lines=True,
        orient="records",
    )
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a data source and registers the results in an elasticsearch index

    The mapped data is stringified column by column, annotated per partition with the
    pipeline predictions (or explanations, if `config.explain` is set) and extended with
    a `metadata` column built from the source columns that are not part of the mapped data.
    The result is handed to a `DaskElasticClient` for saving, and an explore record
    describing the run is created through `elasticsearch`.

    Parameters
    ----------
    pipeline
        The pipeline used for the annotations; its name, type name, inputs, output and
        head information are stored in the explore record
    data_source
        The data source providing the mapped data frame and the source data frame
    config
        The explore configuration: prediction cache, batch size, explain flag,
        force delete flag and extra metadata are read from it
    elasticsearch
        The elasticsearch explore configuration: host, index and doc type are read from it,
        and it is used for creating the explore data index and the explore data record

    Returns
    -------
    The persisted dask DataFrame returned by the `DaskElasticClient.save` call
    """
    # NOTE(review): callers may build the configuration with `prediction_cache_size=...`;
    # confirm that `ExploreConfiguration` exposes that value as `prediction_cache`.
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when we have a predict_batch_json method implemented ...
    # One partition per `config.batch_size` rows (rounded), but never less than one partition
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame) -> pd.Series:
        """Applies `apply_func` to a whole partition and returns the sanitized results, aligned with the partition index"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(
        npartitions=n_partitions).persist()
    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch, meta=(None, object))

    # The source data gets the same number of partitions as the mapped data
    ddf_source = (data_source.to_dataframe().repartition(
        npartitions=n_partitions).persist())
    # Keep as metadata only non used values/columns
    ddf_source = ddf_source[[
        c for c in ddf_source.columns if c not in ddf_mapped.columns
    ]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records"))))

    # NOTE(review): `save` is called before the explore data index is created below;
    # presumably the write is deferred until the final `persist` -- confirm.
    ddf = DaskElasticClient(host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True).save(
        ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        # The source data frame is built once more here, only for reading its column names
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()
def train_data_source() -> DataSource:
    """Build the training ``DataSource`` for ``resources/data/business.cat.2k.train.csv``."""
    data_dir = Path(__file__).parent.parent.joinpath("resources", "data")
    train_file = data_dir / "business.cat.2k.train.csv"
    return DataSource(source=str(train_file))
def train_valid_data_source() -> Tuple[DataSource, DataSource]:
    """Build the training and validation ``DataSource`` pair from the ``business.cat.2k`` csv files.

    Returns
    -------
    The training data source first, the validation data source second.
    """
    data_dir = Path(__file__).parent.parent.joinpath("resources", "data")
    train_file = data_dir / "business.cat.2k.train.csv"
    valid_file = data_dir / "business.cat.2k.valid.csv"

    return DataSource(source=str(train_file)), DataSource(source=str(valid_file))
def explore(pipeline_path: str, data_source: str, explain: bool, es_host: str) -> None:
    # Load the pretrained pipeline first, then the data source from its yaml file,
    # and run an exploration of that data source with the pipeline.
    pipeline = Pipeline.from_pretrained(pipeline_path)
    source = DataSource.from_yaml(data_source)
    pipeline.explore(data_source=source, es_host=es_host, explain=explain)
def test_override_format(self):
    """An explicit ``format="not-found"`` must raise ``TypeError``, even for a ``*.jsonl`` source."""
    jsonl_glob = os.path.join(FILES_PATH, "*.jsonl")

    with pytest.raises(TypeError):
        DataSource(source=jsonl_glob, format="not-found")
def test_wrong_format(self):
    """A format without a source raises ``MissingArgumentError``; the source ``"not-found"`` alone raises ``TypeError``."""
    failing_constructions = (
        (MissingArgumentError, {"format": "not-found"}),
        # New format
        (TypeError, {"source": "not-found"}),
    )

    for expected_error, kwargs in failing_constructions:
        with pytest.raises(expected_error):
            DataSource(**kwargs)