Code Example #1
File: main.py  Project: abs-tudelft/SVCall
def createFromArrowRecordBatchesRDD(self, ardd, schema=None, timezone=None):
    import pyarrow as pa  # imported at module level in main.py; shown here for completeness
    from pyspark.sql.pandas.types import from_arrow_schema
    from pyspark.sql.dataframe import DataFrame

    # Keep only the Arrow record batches and cache the filtered RDD
    ardd = ardd.filter(lambda x: isinstance(x, pa.RecordBatch)).cache()

    # Serialize each batch to bytes for transfer to the JVM
    # (_arrow_record_batch_dumps is a helper defined elsewhere in main.py)
    ardd = ardd.map(_arrow_record_batch_dumps)

    # Pick the Spark schema matching the aligner chosen on the command
    # line; args, sam_schema, and _schema are defined elsewhere in main.py.
    if args.aligner == "BWA":
        schema = from_arrow_schema(sam_schema())
    else:
        schema = from_arrow_schema(_schema())

    # Create the Spark DataFrame directly from the Arrow data and schema
    jrdd = ardd._to_java_object_rdd()
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(),
                                               self._wrapped._jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema

    return df
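
A minimal usage sketch for this method, assuming a running SparkSession (spark) that exposes createFromArrowRecordBatchesRDD; the column names are placeholders, since the actual schema is dictated by args.aligner inside the method:

import pyarrow as pa

# Hypothetical driver code: build an RDD of Arrow record batches and
# hand it to the method above. Column names here are illustrative only.
batch = pa.RecordBatch.from_pydict({"c0": [1, 2], "c1": [10, 20]})
ardd = spark.sparkContext.parallelize([batch])

df = spark.createFromArrowRecordBatchesRDD(ardd)
df.show()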
Code Example #2
def test_schema_conversion_roundtrip(self):
    from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema
    arrow_schema = to_arrow_schema(self.schema)
    schema_rt = from_arrow_schema(arrow_schema)
    self.assertEqual(self.schema, schema_rt)
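
For context, the same roundtrip can be reproduced outside the test class. A minimal sketch, assuming PySpark 3.x and an illustrative two-field schema:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.pandas.types import to_arrow_schema, from_arrow_schema

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])
arrow_schema = to_arrow_schema(schema)       # Spark -> Arrow
schema_rt = from_arrow_schema(arrow_schema)  # Arrow -> Spark
assert schema == schema_rt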