Example #1
    def explore(
        self,
        data_source: DataSource,
        explore_id: Optional[str] = None,
        es_host: Optional[str] = None,
        batch_size: int = 50,
        prediction_cache_size: int = 0,
        explain: bool = False,
        force_delete: bool = True,
        **metadata,
    ) -> dd.DataFrame:
        """Launches the Explore UI for a given data source

        Running this method inside an `IPython` notebook will try to render the UI directly in the notebook.

        Running this outside a notebook will try to launch the standalone web application.

        Parameters
        ----------
        data_source: `DataSource`
            The data source or its yaml file path
        explore_id: `Optional[str]`
            A name or id for this explore run, useful for running and keeping track of several explorations
        es_host: `Optional[str]`
            The URL to the Elasticsearch host for indexing predictions (default is `localhost:9200`)
        batch_size: `int`
            The batch size for indexing predictions (default is `50`)
        prediction_cache_size: `int`
            The size of the cache for caching predictions (default is `0`)
        explain: `bool`
            Whether to extract and return explanations of token importance (default is `False`)
        force_delete: `bool`
            Whether to delete an existing exploration with the same `explore_id` before indexing the new explore items (default is `True`)

        Returns
        -------
        explore_df: `dd.DataFrame`
            A dask DataFrame with the indexed predictions
        """
        from ._helpers import _explore, _show_explore

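        # Bundle the indexing options and any extra metadata into the explore configuration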
        config = ExploreConfiguration(
            batch_size=batch_size,
            prediction_cache_size=prediction_cache_size,
            explain=explain,
            force_delete=force_delete,
            **metadata,
        )

        es_config = ElasticsearchExplore(
            es_index=explore_id or str(uuid.uuid1()),
            es_host=es_host or constants.DEFAULT_ES_HOST,
        )

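        # Fall back to the model's default data source mapping when none is given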
        if not data_source.mapping:
            data_source.mapping = self._model._default_ds_mapping
        explore_df = _explore(self, data_source, config, es_config)
        _show_explore(es_config)

        return explore_df
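
A minimal call-site sketch for the method above, assuming `pipeline` is an object exposing this `explore` method and that `DataSource` can be built from a yaml path; all names and paths are illustrative, not part of the original snippet:

# Hypothetical usage: launch the Explore UI and keep the returned dask DataFrame.
ds = DataSource("my_datasource.yml")  # assumed data source yaml definition
explore_df = pipeline.explore(
    ds,
    explore_id="sentiment-run-1",  # stable id; with force_delete=True a rerun replaces the index
    es_host="localhost:9200",      # the documented default host
    explain=True,                  # also index token-importance explanations
)
print(explore_df.head())           # indexed predictions as a dask DataFrame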
Example #2
    def create_dataset(self,
                       datasource: DataSource,
                       lazy: bool = False) -> InstancesDataset:
        """
        Creates a torch Dataset of instances from a data source

        Parameters
        ----------
        datasource:
            The source of data
        lazy:
            If enabled, the returned dataset is a subclass of `torch.utils.data.IterableDataset`

        Returns
        -------
        A torch Dataset containing the collection of instances
        """
        mapping = {k: k for k in self.inputs + [self.output] if k}
        mapping.update(datasource.mapping)

        datasource.mapping = mapping
        ddf = datasource.to_mapped_dataframe()
        instances_series: "dask.dataframe.core.Series" = ddf.map_partitions(
            lambda df: df.apply(
                lambda row: self.head.featurize(**row.to_dict()), axis=1),
            meta=object,
        ).persist()
        # Remove the examples that could not be featurized from the dataset; the head should log a warning for them
        instances_series = instances_series.dropna()

        def build_instance_generator(instances: "dask.dataframe.core.Series"):
            """Configures an instance generator from a dask Series of instances"""

            def instance_generator(path: str) -> Iterable[Instance]:
                # `path` is unused here; instances come from the precomputed series
                yield from instances

            return instance_generator

        if lazy:
            return AllennlpLazyDataset(
                instance_generator=build_instance_generator(instances_series),
                file_path="dummy",
            )

        return AllennlpDataset(list(instances_series.compute()))
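
A short sketch contrasting the eager and lazy return types; `pipeline` and the yaml path are illustrative assumptions, not part of the original snippet:

# Hypothetical usage: same data source, two dataset flavours.
ds = DataSource("train.data.yml")                 # assumed data source definition
eager_ds = pipeline.create_dataset(ds)            # AllennlpDataset: instances materialized in memory
lazy_ds = pipeline.create_dataset(ds, lazy=True)  # AllennlpLazyDataset: instances yielded on iteration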