def explore(
    self,
    data_source: DataSource,
    explore_id: Optional[str] = None,
    es_host: Optional[str] = None,
    batch_size: int = 50,
    prediction_cache_size: int = 0,
    explain: bool = False,
    force_delete: bool = True,
    **metadata,
) -> dd.DataFrame:
    """Launches the Explore UI for a given data source

    Running this method inside an `IPython` notebook will try to render the UI
    directly in the notebook.

    Running this outside a notebook will try to launch the standalone web application.

    Parameters
    ----------
    data_source: `DataSource`
        The data source or its yaml file path
    explore_id: `Optional[str]`
        A name or id for this explore run, useful for running and keep track of several explorations
    es_host: `Optional[str]`
        The URL to the Elasticsearch host for indexing predictions (default is `localhost:9200`)
    batch_size: `int`
        The batch size for indexing predictions (default is `50`)
    prediction_cache_size: `int`
        The size of the cache for caching predictions (default is `0`)
    explain: `bool`
        Whether to extract and return explanations of token importance (default is `False`)
    force_delete: `bool`
        Deletes exploration with the same `explore_id` before indexing the new explore items
        (default is `True`)
    **metadata
        Extra arguments passed on to the `ExploreConfiguration`

    Returns
    -------
    explore_df: `dd.DataFrame`
        The dask DataFrame holding the indexed predictions
    """
    from ._helpers import _explore, _show_explore

    config = ExploreConfiguration(
        batch_size=batch_size,
        prediction_cache_size=prediction_cache_size,
        explain=explain,
        force_delete=force_delete,
        **metadata,
    )
    es_config = ElasticsearchExplore(
        # uuid1 yields a unique default index name when no explore_id is given
        es_index=explore_id or str(uuid.uuid1()),
        es_host=es_host or constants.DEFAULT_ES_HOST,
    )
    # Fall back to the model's default data source mapping when none was configured
    if not data_source.mapping:
        data_source.mapping = self._model._default_ds_mapping

    explore_df = _explore(self, data_source, config, es_config)
    _show_explore(es_config)

    return explore_df
def create_dataset(self, datasource: DataSource, lazy: bool = False) -> InstancesDataset:
    """Creates an instances torch Dataset from a data source

    Parameters
    ----------
    datasource:
        The source of data
    lazy:
        If enabled, the returned dataset is a subclass of `torch.data.utils.IterableDataset`

    Returns
    -------
    A torch Dataset containing the instances collection
    """
    # Ensure every pipeline input/output column has at least an identity mapping,
    # while keeping any mapping already configured on the data source.
    identity_mapping = {name: name for name in self.inputs + [self.output] if name}
    identity_mapping.update(datasource.mapping)
    datasource.mapping = identity_mapping

    mapped_ddf = datasource.to_mapped_dataframe()
    featurized: "dask.dataframe.core.Series" = mapped_ddf.map_partitions(
        lambda partition: partition.apply(
            lambda row: self.head.featurize(**row.to_dict()), axis=1
        ),
        meta=object,
    ).persist()

    # We remove the not featurizable examples from the data set. The head should log a warning for them though!
    featurized = featurized.dropna()

    if lazy:

        def make_instance_generator(instances: DataFrame):
            """Configures an instance generator from DataFrame"""

            def generator(path: str) -> Iterable[Instance]:
                yield from instances

            return generator

        return AllennlpLazyDataset(
            instance_generator=make_instance_generator(featurized),
            file_path="dummy",
        )

    return AllennlpDataset(list(featurized.compute()))