def createFromArrowRecordBatchesRDD(self, ardd, schema=None, timezone=None):
    # In Spark 3.x these helpers live under pyspark.sql.pandas.types;
    # earlier versions imported from_arrow_schema from pyspark.sql.types
    # and serialized batches with pyspark.serializers.ArrowSerializer.
    from pyspark.sql.pandas.types import from_arrow_schema
    from pyspark.sql.dataframe import DataFrame

    # Keep only Arrow record batches, cache them, then serialize each batch
    # to bytes so it can be shipped across the Py4J boundary to the JVM.
    # `pa` and `_arrow_record_batch_dumps` are module-level names.
    ardd = ardd.filter(lambda x: isinstance(x, pa.RecordBatch)).cache()
    ardd = ardd.map(_arrow_record_batch_dumps)

    # Derive the Spark schema from the module's Arrow schema helpers
    # (`args`, `sam_schema`, and `_schema` are defined elsewhere) unless the
    # caller supplied one. A schema can also be built by hand, e.g.
    # pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())],
    #           metadata={b'foo': b'bar'}).
    if schema is None:
        if args.aligner == "BWA":
            schema = from_arrow_schema(sam_schema())
        else:
            schema = from_arrow_schema(_schema())

    # Create the Spark DataFrame directly from the Arrow data and schema.
    jrdd = ardd._to_java_object_rdd()
    jdf = self._jvm.PythonSQLUtils.toDataFrame(
        jrdd, schema.json(), self._wrapped._jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
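
# A minimal sketch of the `_arrow_record_batch_dumps` helper that the method
# above maps over the RDD, assuming it serializes each RecordBatch to Arrow
# IPC-stream bytes so batches travel to the JVM as plain byte arrays. The
# project's actual helper (possibly built on pyspark.serializers.ArrowSerializer)
# may differ.
def _arrow_record_batch_dumps(batch):
    import pyarrow as pa
    sink = pa.BufferOutputStream()
    # Write a single-batch IPC stream and hand back its raw bytes.
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return bytearray(sink.getvalue().to_pybytes())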
def test_schema_conversion_roundtrip(self):
    from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema
    # Converting a Spark schema to Arrow and back should be lossless.
    arrow_schema = to_arrow_schema(self.schema)
    schema_rt = from_arrow_schema(arrow_schema)
    self.assertEqual(self.schema, schema_rt)
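
# A standalone illustration of the round trip exercised by the test above.
# The field names and types here are hypothetical stand-ins for self.schema,
# but to_arrow_schema and from_arrow_schema are the real
# pyspark.sql.pandas.types helpers.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema

spark_schema = StructType([
    StructField("c0", IntegerType()),
    StructField("c1", StringType()),
])
arrow_schema = to_arrow_schema(spark_schema)  # IntegerType -> pa.int32(), StringType -> pa.string()
assert from_arrow_schema(arrow_schema) == spark_schema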