def test14(spark):
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
            "stream", "video").option("encoding", "chunked_v1").load())

    # Decode JSON event.
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select(
        '*',
        from_json('event_string', schema=schema,
                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')

    df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        expected = struct.unpack('!I', checksum)[0]
        calculated = zlib.crc32(data)
        # print('expected=%d, calculated=%d' % (expected, calculated))
        return expected == calculated

    df = df.withColumnRenamed('data', 'checksum_and_data')
    df = df.select('*',
                   parse_checksum('checksum_and_data').alias('checksum'),
                   parse_data('checksum_and_data').alias('data'))
    df = df.select(
        '*',
        is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    df = df.select('*', length('data'))
    df = df.drop('raw_event', 'event_string', 'event', 'checksum_and_data',
                 'data')

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate', 'false').start().awaitTermination())
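For context, here is a hedged sketch of the writer-side framing that the checksum parsing in test14 assumes: a 4-byte big-endian CRC32 of the payload followed by the payload itself. The payload value is illustrative only.

import struct
import zlib

payload = b'example frame bytes'  # hypothetical payload
# Prepend the CRC32 so that is_checksum_correct() above can unpack the first
# 4 bytes with '!I' and compare them against zlib.crc32() of the rest.
checksum_and_data = struct.pack('!I', zlib.crc32(payload)) + payload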
Example 2
    def test_infer_binary_type(self):
        binaryrow = [Row(f1='a', f2=b"abcd")]
        df = self.sc.parallelize(binaryrow).toDF()
        self.assertEqual(df.schema.fields[1].dataType, BinaryType())

        # Saving this as Parquet also caused issues.
        output_dir = os.path.join(self.tempdir.name, "infer_binary_type")
        df.write.parquet(output_dir)
        df1 = self.spark.read.parquet(output_dir)
        self.assertEqual('a', df1.first().f1)
        self.assertEqual(b"abcd", df1.first().f2)

        self.assertEqual(_infer_type(b""), BinaryType())
        self.assertEqual(_infer_type(b"1234"), BinaryType())
Example 3
    def test_data_type_ops(self):
        _mock_spark_type = DataType()
        _mock_dtype = ExtensionDtype()
        _mappings = (
            (CategoricalDtype(), _mock_spark_type, CategoricalOps),
            (_mock_dtype, DecimalType(), DecimalOps),
            (_mock_dtype, FractionalType(), FractionalOps),
            (_mock_dtype, IntegralType(), IntegralOps),
            (_mock_dtype, StringType(), StringOps),
            (_mock_dtype, BooleanType(), BooleanOps),
            (_mock_dtype, TimestampType(), DatetimeOps),
            (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
            (_mock_dtype, DateType(), DateOps),
            (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
            (_mock_dtype, BinaryType(), BinaryOps),
            (_mock_dtype, ArrayType(StringType()), ArrayOps),
            (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
            (_mock_dtype, StructType(), StructOps),
            (_mock_dtype, NullType(), NullOps),
            (_mock_dtype, UserDefinedType(), UDTOps),
        )
        for _dtype, _spark_type, _ops in _mappings:
            self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

        _unknown_spark_type = _mock_spark_type
        self.assertRaises(TypeError, DataTypeOps, BooleanType(),
                          _unknown_spark_type)
Example 4
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal
        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True)
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), date(1969, 1, 1),
             datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
            (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), date(2012, 2, 2),
             datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
            (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), date(2100, 3, 3),
             datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
            (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"), date(2262, 4, 12),
             datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
Example 5
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
    BINARY = (BinaryType(), "boolean", "BINARY")
    BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")
    DATE = (DateType(), "timestamp", "DATE")
    DECIMAL = (DecimalType(), "decimal", "DECIMAL")
    DOUBLE = (DoubleType(), "double", "DOUBLE")
    FLOAT = (FloatType(), "float", "FLOAT")
    INTEGER = (IntegerType(), "int", "INT")
    BIGINT = (LongType(), "bigint", "BIGINT")
    STRING = (StringType(), "text", "STRING")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>",
                    "ARRAY<BIGINT>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>",
                    "ARRAY<STRING>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>",
                   "ARRAY<FLOAT>")

    def __init__(self, spark: PySparkDataType, cassandra: str,
                 spark_sql: str) -> None:
        self.spark = spark
        self.cassandra = cassandra
        self.spark_sql = spark_sql
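A minimal usage sketch for the enum above, assuming it and the PySpark type classes are importable; each member exposes the three tuple elements as the attributes set in __init__.

from pyspark.sql.types import BinaryType

member = DataType.BINARY
assert isinstance(member.spark, BinaryType)  # PySpark type instance
assert member.cassandra == "boolean"         # Cassandra type string, as defined above
assert member.spark_sql == "BINARY"          # Spark SQL type name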
Example 6
    def test_BinaryType_serialization(self):
        # Pyrolite version <= 4.9 could not serialize BinaryType with Python 3 (SPARK-17808).
        # The empty bytearray is a test for SPARK-21534.
        schema = StructType([StructField('mybytes', BinaryType())])
        data = [[bytearray(b'here is my data')],
                [bytearray(b'and here is some more')], [bytearray(b'')]]
        df = self.spark.createDataFrame(data, schema=schema)
        df.collect()
Example 7
    def test_filesTODF(self):
        df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
        self.assertEqual(df.rdd.getNumPartitions(), 217)
        self.assertEqual(df.schema.fields[0].dataType, StringType())
        self.assertEqual(df.schema.fields[1].dataType, BinaryType())
        first = df.first()
        self.assertTrue(hasattr(first, "filePath"))
        self.assertEqual(type(first.fileData), bytearray)
Example 8
def heatmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) != 2:
        return None

    col_point = df.schema.names[0]
    col_count = df.schema.names[1]
    from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   StringType, IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection
    coor = vega.coor()
    bounding_box = vega.bounding_box()
    height = vega.height()
    width = vega.width()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(
        bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(
        bounding_box[1]) + ')'
    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias(col_point),
            col(col_count))
    else:
        df = df.select(
            Projection(col(col_point), lit(bottom_right), lit(top_left),
                       lit(int(height)), lit(int(width))).alias(col_point),
            col(col_count))

    agg_schema = StructType([
        StructField(col_point, BinaryType(), True),
        StructField(col_count, IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        for pdf in batch_iter:
            dd = pdf.groupby([col_point])
            dd = dd[col_count].agg(['sum']).reset_index()
            dd.columns = [col_point, col_count]
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def heatmap_wkb(point, w, conf=vega):
        from arctern import heat_map
        return heat_map(conf, point, w)

    agg_df = df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
    hex_data = agg_df.agg(heatmap_wkb(agg_df[col_point],
                                      agg_df[col_count])).collect()[0][0]
    return hex_data
Example 9
def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello", bytearray(b"spark\x01\x02"),
          -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
            # TODO: we will improve the test once the batch_size argument
            #  is added.
            # Now we only have one batch.
        for i in range(converter.dataset_size):
            for col in df.schema.names:
                actual_ele = getattr(ts, col)[i]
                expected_ele = expected_df[i][col]
                if col == "str_col":
                    actual_ele = actual_ele.decode()
                if col == "bin_col":
                    actual_ele = bytearray(actual_ele)
                if col == "float_col" or col == "double_col":
                    # Note that the default dtype is float32
                    assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                else:
                    assert expected_ele == actual_ele

        assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type
    # Default dtype float32
    assert np.float32 == ts.double_col.dtype.type
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type
Example 10
def get_downloaded_images_df(input_parquet, sum_accumulator):
    download_image_udf = spark.udf.register(
        "download_image", lambda x: download_image(x, sum_accumulator),
        BinaryType())

    downloaded_images_df = spark.read.parquet(input_parquet) \
        .select("id", "photo_video_download_url") \
        .withColumn('image_bytes', download_image_udf('photo_video_download_url')) \
        .drop('photo_video_download_url')

    return downloaded_images_df
Example 11
def spark_streaming_to_pubsublite(project_number: int, location: str,
                                  topic_id: str) -> None:
    # [START pubsublite_spark_streaming_to_pubsublite]
    from pyspark.sql import SparkSession
    from pyspark.sql.types import BinaryType, StringType
    import uuid

    # TODO(developer):
    # project_number = 11223344556677
    # location = "us-central1-a"
    # topic_id = "your-topic-id"

    spark = SparkSession.builder.appName("write-app").master(
        "yarn").getOrCreate()

    # Create a RateStreamSource that generates consecutive numbers with timestamps:
    # |-- timestamp: timestamp (nullable = true)
    # |-- value: long (nullable = true)
    sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

    sdf = (sdf.withColumn("key", (sdf.value % 5).cast(StringType()).cast(
        BinaryType())).withColumn("event_timestamp", sdf.timestamp).withColumn(
            "data",
            sdf.value.cast(StringType()).cast(BinaryType())).drop(
                "value", "timestamp"))

    sdf.printSchema()

    query = (
        sdf.writeStream.format("pubsublite").option(
            "pubsublite.topic",
            f"projects/{project_number}/locations/{location}/topics/{topic_id}",
        )
        # Required. Use a unique checkpoint location for each job.
        .option("checkpointLocation",
                "/tmp/app" + uuid.uuid4().hex).outputMode("append").trigger(
                    processingTime="1 second").start())

    # Wait 60 seconds to terminate the query.
    query.awaitTermination(60)
    query.stop()
Example 12
def _agg_func_template(df, col_name, st_agg_func):
    import pandas as pd
    from pyspark.sql.functions import pandas_udf, PandasUDFType
    from pyspark.sql.types import (StructType, StructField, BinaryType)

    agg_schema = StructType([StructField('geos', BinaryType(), True)])
    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def agg_step1(batch_iter, col_name=col_name):
        for pdf in batch_iter:
            ret = st_agg_func(pdf[col_name])
            df = pd.DataFrame({"geos": [ret[0]]})
            yield df

    @pandas_udf(BinaryType(), PandasUDFType.GROUPED_AGG)
    def agg_step2(geos):
        return st_agg_func(geos)[0]

    agg_df = df.mapInPandas(agg_step1)
    agg_df = agg_df.coalesce(1)
    ret = agg_df.agg(agg_step2(agg_df['geos'])).collect()[0][0]
    return ret
Example 13
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
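A brief usage sketch for the converter above, assuming pyarrow is installed and the referenced Spark type classes are imported in scope:

import pyarrow as pa

assert from_arrow_type(pa.binary()) == BinaryType()
assert from_arrow_type(pa.list_(pa.string())) == ArrayType(StringType())
assert from_arrow_type(pa.decimal128(10, 2)) == DecimalType(10, 2)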
Example 14
 def create_schema(col_schema: dict) -> StructType:
     type_mapping = {
         'str': StringType(),
         'int': IntegerType(),
         'float': FloatType(),
         'bool': BooleanType(),
         'date': DateType(),
         'bytes': BinaryType()
     }
     schema = [StructField(col_name, type_mapping.get(field_type), True) for col_name, field_type in
               col_schema.items()]
     return StructType(fields=schema)
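A small usage sketch for the helper above, assuming it is available as a plain function; the column specification is hypothetical.

cols = {'id': 'int', 'name': 'str', 'payload': 'bytes'}
schema = create_schema(cols)
# Yields a StructType with nullable IntegerType, StringType and BinaryType fields.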
Example 15
    def test_image_round_trip(self):
        # Test round trip: array -> png -> sparkImg -> array
        binarySchema = StructType([StructField("data", BinaryType(), False)])
        df = self.session.createDataFrame([[bytearray(pngData)]], binarySchema)

        # Convert to images
        decImg = udf(imageIO._decodeImage, imageIO.imageSchema)
        imageDF = df.select(decImg("data").alias("image"))
        row = imageDF.first()

        testArray = imageIO.imageStructToArray(row.image)
        self.assertEqual(testArray.shape, array.shape)
        self.assertEqual(testArray.dtype, array.dtype)
        self.assertTrue(np.all(array == testArray))
Example 16
def choroplethmap(df, vega):
    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   StringType, IntegerType)
    from ._wrapper_func import TransformAndProjection
    coor = vega.coor()
    bounding_box = vega.bounding_box()
    height = vega.height()
    width = vega.width()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(
        bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(
        bounding_box[1]) + ')'
    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col('wkt'), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias("wkb"), col('w'))

    vega = vega.build()
    agg_schema = StructType([
        StructField('wkb', BinaryType(), True),
        StructField('w', IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        for pdf in batch_iter:
            dd = pdf.groupby(['wkb'])
            dd = dd['w'].agg(['sum']).reset_index()
            dd.columns = ['wkb', 'w']
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def choroplethmap_wkb(wkb, w, conf=vega):
        from arctern import choropleth_map
        return choropleth_map(wkb, w, conf.encode('utf-8'))

    @pandas_udf("double", PandasUDFType.GROUPED_AGG)
    def sum_udf(v):
        return v.sum()

    agg_df = df.where("wkb != ''")
    agg_df = agg_df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.coalesce(1)
    hex_data = agg_df.agg(choroplethmap_wkb(agg_df['wkb'],
                                            agg_df['w'])).collect()[0][0]
    return hex_data
Example 17
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
Example 18
    def sqlType(cls):
        """
        Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
        """
        extent = StructType([
            StructField("xmin", DoubleType(), True),
            StructField("ymin", DoubleType(), True),
            StructField("xmax", DoubleType(), True),
            StructField("ymax", DoubleType(), True)
        ])
        grid = StructType([
            StructField("colMin", IntegerType(), True),
            StructField("rowMin", IntegerType(), True),
            StructField("colMax", IntegerType(), True),
            StructField("rowMax", IntegerType(), True)
        ])

        ref = StructType([
            StructField(
                "source",
                StructType(
                    [StructField("raster_source_kryo", BinaryType(), False)]),
                True),
            StructField("bandIndex", IntegerType(), True),
            StructField("subextent", extent, True),
            StructField("subgrid", grid, True),
        ])

        return StructType([
            StructField("cellType", StringType(), False),
            StructField("cols", IntegerType(), False),
            StructField("rows", IntegerType(), False),
            StructField("cells", BinaryType(), True),
            StructField("gridBounds", grid, True),
            StructField("ref", ref, True)
        ])
Example 19
def sqlalchemy_spark_type(sqlalchemy_type: sqlalchemy.types.TypeEngine):
    if isinstance(sqlalchemy_type, sqlalchemy.types.Binary):
        return BinaryType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Boolean):
        return BooleanType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Date):
        return DateType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.DateTime):
        return TimestampType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Integer):
        return LongType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.String):
        return StringType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Float):
        return DoubleType()
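A short usage sketch for the mapping above, assuming the Spark type classes are imported in scope and a SQLAlchemy version in which sqlalchemy.types.Binary still exists (newer releases keep only LargeBinary):

import sqlalchemy

assert sqlalchemy_spark_type(sqlalchemy.types.Integer()) == LongType()
assert sqlalchemy_spark_type(sqlalchemy.types.String()) == StringType()
# Unmapped SQLAlchemy types fall through and the function returns None.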
Example 20
def test_primitives():
    assert BooleanType() == parse_schema("bool")
    assert BooleanType() == parse_schema("boolean")

    assert ByteType() == parse_schema("byte")
    assert ByteType() == parse_schema("tinyint")

    assert ShortType() == parse_schema("short")
    assert ShortType() == parse_schema("smallint")

    assert IntegerType() == parse_schema("int")
    assert FloatType() == parse_schema("float")
    assert DoubleType() == parse_schema("double")

    assert StringType() == parse_schema("string")
    assert BinaryType() == parse_schema("binary")
Example 21
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
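A hedged usage sketch, assuming an active SparkContext named sc and a directory of files at the hypothetical path shown:

df = filesToDF(sc, '/data/images', numPartitions=8)
df.printSchema()
# root
#  |-- filePath: string (nullable = false)
#  |-- fileData: binary (nullable = false)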
Example 22
def run_pyspark_pipeline(dir_path, spark, cores, out_path):
    """ Reads parquet files from 'dir_path' and parses trec_car_tools.Page object to create protobuf with entity
    linking. """

    print('start preprocess')
    start_preprocess = time.time()

    # Reads parquet files from 'dir_path' - each row is a TREC CAR pages.
    df_in = spark.read.parquet(dir_path)
    df_in.printSchema()
    num_partitions = df_in.rdd.getNumPartitions()
    print("Number of default partitions: {}".format(num_partitions))

    print('end preprocess')
    end_preprocess = time.time()
    print("*** preprocess time: {:.2f}s ***".format(end_preprocess -
                                                    start_preprocess))

    print('start pyspark_processing job')
    start_pyspark_job = time.time()

    if num_partitions < cores * 4:
        print('repartitioning df')
        df_in = df_in.repartition(cores * 4)
        print("Number of partitions should equal 4*cores --> {}".format(
            df_in.rdd.getNumPartitions()))

    @udf(returnType=BinaryType())
    def parse_udf(page_bytearray):
        # Parses trec_car_tools.Page object to create protobuf with entity linking.
        page = pickle.loads(page_bytearray)
        tp = TrecCarParser()
        doc = tp.parse_page_to_protobuf(page=page)
        doc_bytearray = pickle.dumps(doc.SerializeToString())
        return doc_bytearray

    # Add index to DF.
    df_parse = df_in.withColumn("doc_bytearray", parse_udf("page_bytearray"))
    df_parse = df_parse.withColumn(
        "index",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    df_parse.write.parquet(out_path)

    print('end pyspark_processing job')
    end_pyspark_job = time.time()
    print("*** pyspark_processing job time: {:.2f}s ***".format(
        end_pyspark_job - start_pyspark_job))
Example 23
def _map_field_type(f_type):
    struct_fields = {
        "string": StringType(),
        "binary": BinaryType(),
        "boolean": BooleanType(),
        "date": StringType(),
        "timestamp": StringType(),
        "decimal": DecimalType(),
        "double": DoubleType(),
        "float": FloatType(),
        "number": StringType(),
        "byte": ByteType(),
        "integer": IntegerType(),
        "long": LongType(),
        "short": ShortType(),
        None: StringType()
    }
    return struct_fields.get(f_type, StringType())
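A short usage sketch for the mapping above; the field list is hypothetical, and unrecognized type names fall back to StringType():

from pyspark.sql.types import StructField, StructType

fields = [('name', 'string'), ('payload', 'binary'), ('mystery', 'unknown')]
schema = StructType([StructField(name, _map_field_type(ftype), True)
                     for name, ftype in fields])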
Example 24
    def test_toPandas_empty_df_arrow_enabled(self):
        # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
        # when arrow is enabled
        from datetime import date
        from decimal import Decimal

        schema = StructType([
            StructField("a", StringType(), True),
            StructField("b", IntegerType(), True),
            StructField("c", TimestampType(), True),
            StructField("d", NullType(), True),
            StructField("e", LongType(), True),
            StructField("f", FloatType(), True),
            StructField("g", DateType(), True),
            StructField("h", BinaryType(), True),
            StructField("i", DecimalType(38, 18), True),
            StructField("k", TimestampNTZType(), True),
            StructField("L", DayTimeIntervalType(0, 3), True),
        ])
        df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(),
                                        schema=schema)
        non_empty_df = self.spark.createDataFrame(
            [(
                "a",
                1,
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                None,
                10,
                0.2,
                date(1969, 1, 1),
                bytearray(b"a"),
                Decimal("2.0"),
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                datetime.timedelta(microseconds=123),
            )],
            schema=schema,
        )

        pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
        pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(
            non_empty_df)
        assert_frame_equal(pdf, pdf_arrow)
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
Example 25
 def sqlType(cls) -> StructType:
     return StructType(fields=[
         # dtype field will use dictionary encoding.
         StructField(
             "dtype",
             StringType(),
             False,
         ),
         StructField(
             "shape",
             ArrayType(IntegerType(), False),
             False,
         ),
         StructField(
             "data",
             BinaryType(),
             False,
         ),
     ])
Example 26
  def _infer_sql_type(k, v):
    # special handling for binary features
    if k in binary_features:
      return BinaryType()

    if v.int64_list.value:
      result = v.int64_list.value
      sql_type = LongType()
    elif v.float_list.value:
      result = v.float_list.value
      sql_type = DoubleType()
    else:
      result = v.bytes_list.value
      sql_type = StringType()

    if len(result) > 1:             # represent multi-item tensors as Spark SQL ArrayType() of base types
      return ArrayType(sql_type)
    else:                           # represent everything else as base types (and empty tensors as StringType())
      return sql_type
Example 27
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp")
    BINARY = (BinaryType(), "boolean")
    BOOLEAN = (BooleanType(), "boolean")
    DATE = (DateType(), "timestamp")
    DECIMAL = (DecimalType(), "decimal")
    DOUBLE = (DoubleType(), "double")
    FLOAT = (FloatType(), "float")
    INTEGER = (IntegerType(), "int")
    BIGINT = (LongType(), "bigint")
    STRING = (StringType(), "text")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>")

    def __init__(self, spark, cassandra):
        self.spark = spark
        self.cassandra = cassandra
Example 28
 def sqlType(cls) -> StructType:
     return StructType(
         fields=[
             StructField(
                 "dtype",
                 ShortType(),
                 False,
             ),
             StructField(
                 "shape",
                 ArrayType(IntegerType(), False),
                 False,
             ),
             StructField(
                 "data",
                 BinaryType(),
                 False,
             ),
         ]
     )
Example 29
 def sqlType(cls):
     """
     Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
     """
     return StructType([
         StructField(
             "cell_context",
             StructType([
                 StructField(
                     "cellType",
                     StructType(
                         [StructField("cellTypeName", StringType(),
                                      False)]), False),
                 StructField(
                     "dimensions",
                     StructType([
                         StructField("cols", ShortType(), False),
                         StructField("rows", ShortType(), False)
                     ]), False),
             ]), False),
         StructField(
             "cell_data",
             StructType([
                 StructField("cells", BinaryType(), True),
                 StructField(
                     "ref",
                     StructType([
                         StructField("source", RasterSourceUDT(), False),
                         StructField("bandIndex", IntegerType(), False),
                         StructField(
                             "subextent",
                             StructType([
                                 StructField("xmin", DoubleType(), False),
                                 StructField("ymin", DoubleType(), False),
                                 StructField("xmax", DoubleType(), False),
                                 StructField("ymax", DoubleType(), False)
                             ]), True)
                     ]), True)
             ]), False)
     ])
Example 30
def test13(spark):
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples5')
    df = (spark.readStream.format("pravega").option(
        "controller",
        controller).option("scope", scope).option("stream", "video").load())

    @udf(returnType=IntegerType())
    def parse_chunk_index(event):
        unpacked = struct.unpack('!bxxxhh', event[0:8])
        return unpacked[1]

    @udf(returnType=IntegerType())
    def parse_final_chunk_index(event):
        unpacked = struct.unpack('!bxxxhh', event[0:8])
        return unpacked[2]

    @udf(returnType=BinaryType())
    def parse_payload(event):
        return event[8:]

    df = df.select('*', parse_chunk_index('event').alias('chunk_index'))
    df = df.select('*',
                   parse_final_chunk_index('event').alias('final_chunk_index'))
    df = df.select('*', parse_payload('event').alias('payload'))

    df.printSchema()

    if True:
        (df.writeStream.trigger(
            processingTime='3 seconds')  # limit trigger rate
         .outputMode('append').format('console')
         # .option('truncate', 'false')
         .start().awaitTermination())
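For context, here is a hedged sketch of the writer-side chunk framing that the parsing in test13 assumes: an 8-byte header packed as '!bxxxhh' (one reserved byte, three pad bytes, and two big-endian 16-bit values for the chunk index and final chunk index), followed by the payload. The values are illustrative only.

import struct

chunk_index, final_chunk_index = 0, 2  # hypothetical indices
payload = b'chunk bytes'               # hypothetical payload
event = struct.pack('!bxxxhh', 0, chunk_index, final_chunk_index) + payload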