def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
    """
    Executes the given partitionFunc on the specified set of partitions,
    returning the result as an array of elements.

    If 'partitions' is not specified, this will run over all partitions.

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
    [0, 1, 4, 9, 16, 25]

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
    [0, 1, 16, 25]
    """
    if partitions is None:
        partitions = range(rdd._jrdd.partitions().size())

    # Implementation note: This is implemented as a mapPartitions followed
    # by runJob() in order to avoid having to pass a Python lambda into
    # SparkContext#runJob.
    mappedRDD = rdd.mapPartitions(partitionFunc)
    port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions, allowLocal)
    return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
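# Usage sketch (an assumption, not taken from the snippet above): runJob can evaluate a
# function on only a subset of partitions, which is how operations like take() avoid
# scanning the whole RDD. Assumes an active SparkContext named `sc`.
rdd = sc.parallelize(range(100), 4)
peek = sc.runJob(rdd, lambda it: list(it)[:5], partitions=[0])
# peek == [0, 1, 2, 3, 4] -- only partition 0 is computed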
def _collect_as_arrow(self):
    """
    Returns all records as a list of ArrowRecordBatches, pyarrow must be installed
    and available on driver and worker Python environments.

    .. note:: Experimental.
    """
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, DataFrame)

    with SCCallSiteSync(self._sc):
        port, auth_secret, jsocket_auth_server = self._jdf.collectAsArrowToPython()

    # Collect list of un-ordered batches where last element is a list of correct order indices
    try:
        results = list(_load_from_socket((port, auth_secret), ArrowCollectSerializer()))
    finally:
        # Join serving thread and raise any exceptions from collectAsArrowToPython
        jsocket_auth_server.getResult()

    # Separate RecordBatches from batch order indices in results
    batches = results[:-1]
    batch_order = results[-1]

    # Re-order the batch list using the correct order
    return [batches[i] for i in batch_order]
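# Usage sketch (an assumption, not taken from the snippet above): callers such as
# toPandas() combine the returned RecordBatches into a single pyarrow Table and then
# convert it to pandas. Assumes a pyspark DataFrame named `df` and pyarrow installed;
# _collect_as_arrow is internal, so this only illustrates how its result can be consumed.
import pyarrow as pa

batches = df._collect_as_arrow()
if batches:  # pa.Table.from_batches needs at least one batch (or an explicit schema)
    pdf = pa.Table.from_batches(batches).to_pandas()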
def get_data_frame(self, name: str) -> pandas.DataFrame:
    """Get the dataset with the given name as a pandas DataFrame.

    Raises ValueError if the specified dataset does not exist.
    """
    import pyarrow as pa  # type: ignore
    from pyspark.rdd import _load_from_socket  # type: ignore
    from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore

    name = name.lower()
    if name not in self.artifacts:
        raise ValueError("Unknown artifact: '{}'".format(name))
    if self.artifacts[name].artifact_type != ARTIFACT_TYPE_DATASET:
        raise ValueError("Artifact '{}' is not a dataset".format(name))
    response = self.vizier_request("get_data_frame",
                                   name=name,
                                   includeUncertainty=True,
                                   has_response=True)
    assert response is not None
    results = list(
        _load_from_socket((response['port'], response['secret']),
                          ArrowCollectSerializer()))
    batches = results[:-1]
    batch_order = results[-1]
    ordered_batches = [batches[i] for i in batch_order]
    if len(ordered_batches) > 0:
        table = pa.Table.from_batches(ordered_batches)
        return table.to_pandas()
    else:
        raise Exception(
            "Error loading dataframe '{}'. It has no content.".format(name))
def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
    """
    Executes the given partitionFunc on the specified set of partitions,
    returning the result as an array of elements.

    If 'partitions' is not specified, this will run over all partitions.

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
    [0, 1, 4, 9, 16, 25]

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
    [0, 1, 16, 25]
    """
    if testpy.SHIVAlog == 1:
        print("SHIVA LOG: Context.py in runJob()")
    if partitions is None:
        partitions = range(rdd._jrdd.partitions().size())

    # Implementation note: This is implemented as a mapPartitions followed
    # by runJob() in order to avoid having to pass a Python lambda into
    # SparkContext#runJob.
    print("SHIVA LOG: Context.py calling mapPartitions")
    mappedRDD = rdd.mapPartitions(partitionFunc)
    if testpy.SHIVAlog == 1:
        print("SHIVA LOG: context.py , completed mapPartitions and calling jvm.runJob")
    print("SHIVA LOG: Context.py calling runJob")
    port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
    # print("time is " + str(testpy.filterTotalTime) + " " + str(RDD.testTime) + " " + str(testpy.c.timeT))
    print("SHIVA LOG: Context.py completed runJob")
    tmp = list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
    print("SHIVA LOG: RETURNED from _load_from_socket")
    return tmp
def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
    """
    Returns all records as a list of ArrowRecordBatches, pyarrow must be installed
    and available on driver and worker Python environments.
    This is an experimental feature.

    :param split_batches: split batches such that each column is in its own allocation,
        so that the selfDestruct optimization is effective; default False.

    .. note:: Experimental.
    """
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, DataFrame)

    with SCCallSiteSync(self._sc):
        (
            port,
            auth_secret,
            jsocket_auth_server,
        ) = self._jdf.collectAsArrowToPython()

    # Collect list of un-ordered batches where last element is a list of correct order indices
    try:
        batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
        if split_batches:
            # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
            # each column in each record batch is contained in its own allocation.
            # Otherwise, selfDestruct does nothing; it frees each column as it's
            # converted, but each column will actually be a list of slices of record
            # batches, and so no memory is actually freed until all columns are
            # converted.
            import pyarrow as pa

            results = []
            for batch_or_indices in batch_stream:
                if isinstance(batch_or_indices, pa.RecordBatch):
                    batch_or_indices = pa.RecordBatch.from_arrays(
                        [
                            # This call actually reallocates the array
                            pa.concat_arrays([array])
                            for array in batch_or_indices
                        ],
                        schema=batch_or_indices.schema,
                    )
                results.append(batch_or_indices)
        else:
            results = list(batch_stream)
    finally:
        # Join serving thread and raise any exceptions from collectAsArrowToPython
        jsocket_auth_server.getResult()

    # Separate RecordBatches from batch order indices in results
    batches = results[:-1]
    batch_order = results[-1]

    # Re-order the batch list using the correct order
    return [batches[i] for i in batch_order]
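# Usage sketch (an assumption, not taken from the snippet above): with split_batches=True
# a caller can let pyarrow free each column as it is converted, mirroring what toPandas()
# does when spark.sql.execution.arrow.pyspark.selfDestruct.enabled is set. Assumes a
# pyspark DataFrame named `df` and pyarrow installed.
import pyarrow as pa

batches = df._collect_as_arrow(split_batches=True)
if batches:
    table = pa.Table.from_batches(batches)
    del batches  # drop Python references so self_destruct can actually release memory
    pdf = table.to_pandas(self_destruct=True, split_blocks=True, use_threads=False)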
def collect(self):
    """Returns all the records as a list of :class:`Row`.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def take(self, num):
    """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

    >>> df.take(2)
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
            self._jdf, num)
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def collect(self):
    """Return a list that contains all of the rows.

    Each object in the list is a Row, the fields can be accessed as attributes.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
    cls = _create_cls(self.schema)
    return [cls(r) for r in rs]
def get_dataset_frame(
        self,
        identifier: str,
        force_profiler: Optional[bool] = None) -> Optional[DataFrame]:
    import pyarrow as pa  # type: ignore
    from pyspark.rdd import _load_from_socket  # type: ignore
    from pyspark.sql.pandas.serializers import ArrowCollectSerializer  # type: ignore

    portSecret = mimir.getDataframe(
        query='SELECT * FROM {}'.format(identifier))
    results = list(
        _load_from_socket((portSecret['port'], portSecret['secret']),
                          ArrowCollectSerializer()))
    batches = results[:-1]
    batch_order = results[-1]
    ordered_batches = [batches[i] for i in batch_order]
    table = pa.Table.from_batches(ordered_batches)
    return table.to_pandas()