Example #1
    def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
        """
        Executes the given partitionFunc on the specified set of partitions,
        returning the result as an array of elements.

        If 'partitions' is not specified, this will run over all partitions.

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
        [0, 1, 4, 9, 16, 25]

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
        [0, 1, 16, 25]
        """
        if partitions is None:
            partitions = range(rdd._jrdd.partitions().size())

        # Implementation note: This is implemented as a mapPartitions followed
        # by runJob() in order to avoid having to pass a Python lambda into
        # SparkContext#runJob.
        mappedRDD = rdd.mapPartitions(partitionFunc)
        port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions,
                                          allowLocal)
        return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
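A minimal driver-side sketch of the call above, assuming an already running SparkContext named sc (as in the doctest); the partition selection is purely illustrative:

    # Run the function over two of the four partitions only; the result is the
    # concatenation of the per-partition lists, in partition order.
    rdd = sc.parallelize(range(8), 4)
    partial = sc.runJob(rdd, lambda it: [x * x for x in it], partitions=[0, 3])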
Example #2
    def _collect_as_arrow(self):
        """
        Returns all records as a list of Arrow RecordBatches; pyarrow must be installed
        and available on both the driver and worker Python environments.

        .. note:: Experimental.
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            port, auth_secret, jsocket_auth_server = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            results = list(
                _load_from_socket((port, auth_secret),
                                  ArrowCollectSerializer()))
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
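The batches returned above are ordinary pyarrow RecordBatches, so they can be assembled into a pandas DataFrame the way later examples in this list do. A hedged sketch, assuming pyarrow is installed and df is a PySpark DataFrame:

    import pyarrow as pa

    # _collect_as_arrow() is a private helper; the empty check matters because
    # pa.Table.from_batches() cannot infer a schema from zero batches.
    batches = df._collect_as_arrow()
    if batches:
        pdf = pa.Table.from_batches(batches).to_pandas()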
Example #3
    def get_data_frame(self, name: str) -> pandas.DataFrame:
        """Get dataset with given name as a pandas dataframe.

    Raises ValueError if the specified dataset does not exist.
    """
        import pyarrow as pa  # type: ignore
        from pyspark.rdd import _load_from_socket  # type: ignore
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore
        name = name.lower()
        if name not in self.artifacts:
            raise ValueError("Unknown artifact: '{}'".format(name))
        if self.artifacts[name].artifact_type != ARTIFACT_TYPE_DATASET:
            raise ValueError("Artifact '{}' is not a dataset".format(name))

        response = self.vizier_request("get_data_frame",
                                       name=name,
                                       includeUncertainty=True,
                                       has_response=True)
        assert (response is not None)
        results = list(
            _load_from_socket((response['port'], response['secret']),
                              ArrowCollectSerializer()))
        batches = results[:-1]
        batch_order = results[-1]
        ordered_batches = [batches[i] for i in batch_order]
        if len(ordered_batches) > 0:
            table = pa.Table.from_batches(ordered_batches)
            return table.to_pandas()
        else:
            raise Exception(
                "Error loading dataframe '{}'.  It has no content.".format(
                    name))
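A hedged usage sketch for the accessor above; the client object and dataset name are hypothetical:

    # 'client' stands in for whatever object exposes get_data_frame(); an
    # unknown name or a non-dataset artifact raises ValueError, as shown above.
    pdf = client.get_data_frame("census")
    print(pdf.shape)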
Example #4
    def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
        """
        Executes the given partitionFunc on the specified set of partitions,
        returning the result as an array of elements.

        If 'partitions' is not specified, this will run over all partitions.

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
        [0, 1, 4, 9, 16, 25]

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
        [0, 1, 16, 25]
        """
        # Debug logging in this fork is guarded by the testpy.SHIVAlog flag.
        if testpy.SHIVAlog == 1:
            print("SHIVA LOG: context.py in runJob()")
        if partitions is None:
            partitions = range(rdd._jrdd.partitions().size())

        # Implementation note: This is implemented as a mapPartitions followed
        # by runJob() in order to avoid having to pass a Python lambda into
        # SparkContext#runJob.
        if testpy.SHIVAlog == 1:
            print("SHIVA LOG: context.py calling mapPartitions")
        mappedRDD = rdd.mapPartitions(partitionFunc)

        if testpy.SHIVAlog == 1:
            print("SHIVA LOG: context.py completed mapPartitions, calling jvm.runJob")
        port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions,
                                          allowLocal)
        if testpy.SHIVAlog == 1:
            print("SHIVA LOG: context.py completed jvm.runJob")
        results = list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
        if testpy.SHIVAlog == 1:
            print("SHIVA LOG: context.py returned from _load_from_socket")
        return results
Example #5
    def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
        """
        Returns all records as a list of Arrow RecordBatches; pyarrow must be installed
        and available on both the driver and worker Python environments.
        This is an experimental feature.

        :param split_batches: split batches such that each column is in its own allocation, so
            that the selfDestruct optimization is effective; default False.

        .. note:: Experimental.
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            (
                port,
                auth_secret,
                jsocket_auth_server,
            ) = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
            if split_batches:
                # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
                # each column in each record batch is contained in its own allocation.
                # Otherwise, selfDestruct does nothing; it frees each column as it's
                # converted, but each column will actually be a list of slices of record
                # batches, and so no memory is actually freed until all columns are
                # converted.
                import pyarrow as pa

                results = []
                for batch_or_indices in batch_stream:
                    if isinstance(batch_or_indices, pa.RecordBatch):
                        batch_or_indices = pa.RecordBatch.from_arrays(
                            [
                                # This call actually reallocates the array
                                pa.concat_arrays([array])
                                for array in batch_or_indices
                            ],
                            schema=batch_or_indices.schema,
                        )
                    results.append(batch_or_indices)
            else:
                results = list(batch_stream)
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
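The split_batches path above only pays off if the caller also opts into pyarrow's self-destructing conversion. A hedged sketch of how the result might be consumed, assuming pyarrow is installed and df is a PySpark DataFrame (the to_pandas options below are pyarrow's, not PySpark's):

    import pyarrow as pa

    batches = df._collect_as_arrow(split_batches=True)
    table = pa.Table.from_batches(batches)
    del batches  # drop extra references so freed columns can actually be released
    # self_destruct releases each Arrow column as soon as it has been converted.
    pdf = table.to_pandas(self_destruct=True, split_blocks=True, use_threads=False)
    del table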
Example #6
    def collect(self):
        """Returns all the records as a list of :class:`Row`.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
Example #7
    def take(self, num):
        """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

        >>> df.take(2)
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
                self._jdf, num)
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
Example #8
    def collect(self):
        """Return a list that contains all of the rows.

        Each object in the list is a Row; the fields can be accessed as
        attributes.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
        cls = _create_cls(self.schema)
        return [cls(r) for r in rs]
Example #9
    def get_dataset_frame(
            self,
            identifier: str,
            force_profiler: Optional[bool] = None) -> Optional[DataFrame]:
        import pyarrow as pa  # type: ignore
        from pyspark.rdd import _load_from_socket  # type: ignore
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore

        portSecret = mimir.getDataframe(
            query='SELECT * FROM {}'.format(identifier))
        results = list(
            _load_from_socket((portSecret['port'], portSecret['secret']),
                              ArrowCollectSerializer()))
        batches = results[:-1]
        batch_order = results[-1]
        ordered_batches = [batches[i] for i in batch_order]
        table = pa.Table.from_batches(ordered_batches)
        return table.to_pandas()
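Several examples above repeat the same idiom for separating the trailing order index from the Arrow stream. A small illustrative helper (the name is hypothetical) that captures that idiom:

    def ordered_arrow_batches(port_and_secret):
        # Read the Arrow stream, then reorder the RecordBatches using the list
        # of indices that the server appends as the final element.
        from pyspark.rdd import _load_from_socket
        from pyspark.sql.pandas.serializers import ArrowCollectSerializer

        results = list(_load_from_socket(port_and_secret, ArrowCollectSerializer()))
        batches, batch_order = results[:-1], results[-1]
        return [batches[i] for i in batch_order]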