Example #1
    def _collect_as_arrow(self):
        """
        Returns all records as a list of Arrow RecordBatches. PyArrow must be
        installed and available on both the driver and worker Python environments.

        .. note:: Experimental.
        """
        # These helpers are module-level imports in PySpark's source; they are
        # imported here so the snippet is self-contained.
        from pyspark.rdd import _load_from_socket
        from pyspark.sql.dataframe import DataFrame
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer
        from pyspark.traceback_utils import SCCallSiteSync

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            port, auth_secret, jsocket_auth_server = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            results = list(
                _load_from_socket((port, auth_secret),
                                  ArrowCollectSerializer()))
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
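
A minimal downstream sketch of how the returned batches are typically consumed, assuming `df` is a pyspark.sql.DataFrame on which this private helper is available (PySpark internals, subject to change):

    import pyarrow as pa

    # Hypothetical usage; only pyarrow's public API is relied on here.
    batches = df._collect_as_arrow()
    if batches:
        table = pa.Table.from_batches(batches)  # batches are already re-ordered
        pdf = table.to_pandas()
    else:
        # An empty result produces no batches; handle that case explicitly,
        # for example by building an empty pandas DataFrame.
        pdf = None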
Example #2
    def get_data_frame(self, name: str) -> pandas.DataFrame:
        """Get dataset with given name as a pandas dataframe.

    Raises ValueError if the specified dataset does not exist.
    """
        import pyarrow as pa  # type: ignore
        from pyspark.rdd import _load_from_socket  # type: ignore
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore
        name = name.lower()
        if name not in self.artifacts:
            raise ValueError("Unknown artifact: '{}'".format(name))
        if self.artifacts[name].artifact_type != ARTIFACT_TYPE_DATASET:
            raise ValueError("Artifact '{}' is not a dataset".format(name))

        response = self.vizier_request("get_data_frame",
                                       name=name,
                                       includeUncertainty=True,
                                       has_response=True)
        assert response is not None
        results = list(
            _load_from_socket((response['port'], response['secret']),
                              ArrowCollectSerializer()))
        batches = results[:-1]
        batch_order = results[-1]
        ordered_batches = [batches[i] for i in batch_order]
        if len(ordered_batches) > 0:
            table = pa.Table.from_batches(ordered_batches)
            return table.to_pandas()
        else:
            raise Exception(
                "Error loading dataframe '{}'.  It has no content.".format(
                    name))
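
Both examples unpack the serializer output the same way: ArrowCollectSerializer yields the RecordBatches followed by one final element listing the correct batch order. That step can be factored into a small helper; a sketch, with a hypothetical name:

    from typing import List

    import pyarrow as pa

    def reorder_arrow_batches(results: list) -> List["pa.RecordBatch"]:
        # The last element produced by ArrowCollectSerializer is a list of
        # indices giving the correct order of the preceding RecordBatches.
        batches = results[:-1]
        batch_order = results[-1]
        return [batches[i] for i in batch_order]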
Example #3
    def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
        """
        Returns all records as a list of Arrow RecordBatches. PyArrow must be
        installed and available on both the driver and worker Python environments.
        This is an experimental feature.

        :param split_batches: split batches such that each column is in its own allocation, so
            that the selfDestruct optimization is effective; default False.

        .. note:: Experimental.
        """
        # These helpers are module-level imports in PySpark's source; they are
        # imported here so the snippet is self-contained.
        from pyspark.rdd import _load_from_socket
        from pyspark.sql.dataframe import DataFrame
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer
        from pyspark.traceback_utils import SCCallSiteSync

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            (
                port,
                auth_secret,
                jsocket_auth_server,
            ) = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
            if split_batches:
                # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
                # each column in each record batch is contained in its own allocation.
                # Otherwise, selfDestruct does nothing; it frees each column as it is
                # converted, but each column will actually be a list of slices of record
                # batches, and so no memory is actually freed until all columns are
                # converted.
                import pyarrow as pa

                results = []
                for batch_or_indices in batch_stream:
                    if isinstance(batch_or_indices, pa.RecordBatch):
                        batch_or_indices = pa.RecordBatch.from_arrays(
                            [
                                # This call actually reallocates the array
                                pa.concat_arrays([array])
                                for array in batch_or_indices
                            ],
                            schema=batch_or_indices.schema,
                        )
                    results.append(batch_or_indices)
            else:
                results = list(batch_stream)
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
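
The per-column reallocation used on the split_batches path can be exercised on its own; a minimal sketch, assuming only pyarrow is installed:

    import pyarrow as pa

    # Build a small RecordBatch and copy each column into its own allocation,
    # mirroring the split_batches branch above: pa.concat_arrays on a single
    # array reallocates it into a fresh, contiguous buffer.
    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
        names=["id", "label"],
    )
    split = pa.RecordBatch.from_arrays(
        [pa.concat_arrays([column]) for column in batch],
        schema=batch.schema,
    )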
Example #4
    def get_dataset_frame(
            self,
            identifier: str,
            force_profiler: Optional[bool] = None) -> Optional[DataFrame]:
        import pyarrow as pa  # type: ignore
        from pyspark.rdd import _load_from_socket  # type: ignore
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore

        portSecret = mimir.getDataframe(
            query='SELECT * FROM {}'.format(identifier))
        results = list(
            _load_from_socket((portSecret['port'], portSecret['secret']),
                              ArrowCollectSerializer()))
        batches = results[:-1]
        batch_order = results[-1]
        ordered_batches = [batches[i] for i in batch_order]
        table = pa.Table.from_batches(ordered_batches)
        return table.to_pandas()
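
Unlike Example #2, this version does not guard against an empty batch list, and pa.Table.from_batches raises when it is given neither batches nor a schema. A defensive variant of the final conversion, with a hypothetical schema fallback:

    import pyarrow as pa

    def batches_to_pandas(ordered_batches, schema=None):
        # pa.Table.from_batches needs at least one batch or an explicit schema.
        if ordered_batches:
            return pa.Table.from_batches(ordered_batches).to_pandas()
        if schema is not None:
            return pa.Table.from_batches([], schema=schema).to_pandas()
        raise ValueError("No record batches and no schema to build an empty table")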