def test_no_mapping(self):
    ds = DataSource(
        format="json", source=os.path.join(FILES_PATH, "dataset_source.jsonl")
    )
    with pytest.raises(ValueError):
        ds.to_mapped_dataframe()
def test_dataset_creation_with_partial_mapping(
    datasource_with_partial_mapping: DataSource, pipeline_test: Pipeline
):
    df = datasource_with_partial_mapping.to_mapped_dataframe()
    dataset = pipeline_test.create_dataset(datasource_with_partial_mapping)

    assert isinstance(dataset, AllennlpDataset)
    assert len(dataset) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
def create_dataset(self, datasource: DataSource, lazy: bool = False) -> InstancesDataset:
    """
    Creates an instances torch Dataset from a data source

    Parameters
    ----------
    datasource
        The source of data
    lazy
        If enabled, the returned dataset is a subclass of `torch.utils.data.IterableDataset`

    Returns
    -------
    dataset
        A torch Dataset containing the instances collection
    """
    mapping = {k: k for k in self.inputs + [self.output] if k}
    mapping.update(datasource.mapping)
    datasource.mapping = mapping

    ddf = datasource.to_mapped_dataframe()
    instances_series: "dask.dataframe.core.Series" = ddf.map_partitions(
        lambda df: df.apply(lambda row: self.head.featurize(**row.to_dict()), axis=1),
        meta=object,
    ).persist()
    # Remove the examples that could not be featurized from the dataset.
    # The head should log a warning for them though!
    instances_series = instances_series.dropna()

    def build_instance_generator(instances: DataFrame):
        """Configures an instance generator from a DataFrame"""

        def instance_generator(path: str) -> Iterable[Instance]:
            yield from instances

        return instance_generator

    return (
        AllennlpLazyDataset(
            instance_generator=build_instance_generator(instances_series),
            file_path="dummy",
        )
        if lazy
        else AllennlpDataset(list(instances_series.compute()))
    )
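# Illustrative usage sketch (not part of the module): assumes `pipeline` is an
# already configured Pipeline whose inputs/output match the data source columns,
# and reuses the DataSource construction shown in the tests above. The helper
# name `_example_create_dataset` is hypothetical.
def _example_create_dataset(pipeline: Pipeline) -> None:
    ds = DataSource(
        format="json", source=os.path.join(FILES_PATH, "dataset_source.jsonl")
    )
    # Eager variant: featurizes everything up front into an AllennlpDataset
    eager_dataset = pipeline.create_dataset(ds)
    # Lazy variant: an AllennlpLazyDataset that yields instances on iteration
    lazy_dataset = pipeline.create_dataset(ds, lazy=True)
    print(len(eager_dataset), "instances")
    for instance in lazy_dataset:
        print(instance.fields)
        break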
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a data source and registers the results
    in an elasticsearch index

    Parameters
    ----------
    pipeline
    data_source
    config
    elasticsearch

    Returns
    -------
    ddf
        The persisted dask DataFrame with the annotated data
    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long, ...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # This only really makes sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # A persist is necessary here, otherwise it fails for n_partitions == 1,
    # because with only one partition we would pass a generator on to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(npartitions=n_partitions).persist()

    ddf_mapped["annotation"] = ddf_mapped.map_partitions(
        annotate_batch, meta=(None, object)
    )

    ddf_source = (
        data_source.to_dataframe().repartition(npartitions=n_partitions).persist()
    )
    # Keep only the columns that are not already used as inputs as metadata
    ddf_source = ddf_source[
        [c for c in ddf_source.columns if c not in ddf_mapped.columns]
    ]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records")))
    )

    ddf = DaskElasticClient(
        host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True
    ).save(ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record(
        {
            **(config.metadata or {}),
            "datasource": data_source.source,
            # TODO: This should change when the UI is normalized (action detail and action link naming)
            "explore_name": elasticsearch.es_index,
            "model": pipeline.name,
            "columns": ddf.columns.values.tolist(),
            "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
            "pipeline": pipeline.type_name,
            "output": pipeline.output,
            "inputs": pipeline.inputs,  # backward compatibility
            "signature": pipeline.inputs + [pipeline.output],
            "predict_signature": pipeline.inputs,
            "labels": pipeline.head.labels,
            "task": pipeline.head.task_name().as_string(),
        }
    )
    return ddf.persist()
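# Minimal, self-contained sketch (illustrative, not library code) of the
# batch-annotation pattern used in `_explore`: partitions are sized to the
# batch size and each partition is annotated as one batch via map_partitions.
# `fake_predict_batch` is a hypothetical stand-in for pipeline.predict_batch.
import dask.dataframe as dd
import pandas as pd


def fake_predict_batch(records):
    """Hypothetical batch predictor: returns one prediction dict per input record"""
    return [{"label": "positive", "prob": 0.9} for _ in records]


df = pd.DataFrame({"text": ["a", "b", "c", "d"]})
batch_size = 2
n_partitions = max(1, round(len(df) / batch_size))
# Persist after repartitioning, mirroring the n_partitions == 1 workaround above
ddf = dd.from_pandas(df, npartitions=n_partitions).persist()


def annotate_batch(batch: pd.DataFrame) -> pd.Series:
    predictions = fake_predict_batch(batch.to_dict(orient="records"))
    return pd.Series(predictions, index=batch.index)


ddf["annotation"] = ddf.map_partitions(annotate_batch, meta=(None, object))
print(ddf.compute())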