import os
import struct
import zlib

from pyspark.sql.functions import decode, from_json, length, udf
from pyspark.sql.types import BinaryType, BooleanType


def test14(spark):
    # ssrc is the synchronization source identifier. See
    # https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark
          .readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          .load())

    # Decode the JSON event.
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema,
                                  options=dict(mode='FAILFAST')).alias('event'))
    df = df.select('*', 'event.*')
    df = df.withWatermark('timestamp', '60 seconds')

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        expected = struct.unpack('!I', checksum)[0]
        calculated = zlib.crc32(data)
        return expected == calculated

    df = df.withColumnRenamed('data', 'checksum_and_data')
    df = df.select('*',
                   parse_checksum('checksum_and_data').alias('checksum'),
                   parse_data('checksum_and_data').alias('data'))
    df = df.select('*', is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    df = df.select('*', length('data'))
    df = df.drop('raw_event', 'event_string', 'event', 'checksum_and_data', 'data')
    df.printSchema()
    (df.writeStream
       .trigger(processingTime='3 seconds')  # limit trigger rate
       .outputMode('append')
       .format('console')
       .option('truncate', 'false')
       .start()
       .awaitTermination())
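# A minimal writer-side sketch (hypothetical helper, not from the original
# source): it frames a payload the way the reader above expects it, i.e. a
# big-endian CRC32 of the data followed by the data itself, so that
# is_checksum_correct() accepts it.
def frame_with_checksum(data: bytes) -> bytes:
    # struct.pack('!I', ...) mirrors the struct.unpack('!I', ...) in the reader.
    return struct.pack('!I', zlib.crc32(data)) + data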
def test_infer_binary_type(self):
    binaryrow = [Row(f1='a', f2=b"abcd")]
    df = self.sc.parallelize(binaryrow).toDF()
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())
    # Saving as Parquet caused issues as well.
    output_dir = os.path.join(self.tempdir.name, "infer_binary_type")
    df.write.parquet(output_dir)
    df1 = self.spark.read.parquet(output_dir)
    self.assertEqual('a', df1.first().f1)
    self.assertEqual(b"abcd", df1.first().f2)
    self.assertEqual(_infer_type(b""), BinaryType())
    self.assertEqual(_infer_type(b"1234"), BinaryType())
def test_data_type_ops(self):
    _mock_spark_type = DataType()
    _mock_dtype = ExtensionDtype()
    _mappings = (
        (CategoricalDtype(), _mock_spark_type, CategoricalOps),
        (_mock_dtype, DecimalType(), DecimalOps),
        (_mock_dtype, FractionalType(), FractionalOps),
        (_mock_dtype, IntegralType(), IntegralOps),
        (_mock_dtype, StringType(), StringOps),
        (_mock_dtype, BooleanType(), BooleanOps),
        (_mock_dtype, TimestampType(), DatetimeOps),
        (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
        (_mock_dtype, DateType(), DateOps),
        (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
        (_mock_dtype, BinaryType(), BinaryOps),
        (_mock_dtype, ArrayType(StringType()), ArrayOps),
        (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
        (_mock_dtype, StructType(), StructOps),
        (_mock_dtype, NullType(), NullOps),
        (_mock_dtype, UserDefinedType(), UDTOps),
    )
    for _dtype, _spark_type, _ops in _mappings:
        self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

    _unknown_spark_type = _mock_spark_type
    self.assertRaises(TypeError, DataTypeOps, BooleanType(), _unknown_spark_type)
def setUpClass(cls):
    from datetime import date, datetime
    from decimal import Decimal

    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize the default timezone between Python and Java.
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()
    cls.spark.conf.set("spark.sql.session.timeZone", tz)

    # Test that the legacy config keys fall through to the new
    # spark.sql.execution.arrow.pyspark.* keys.
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "false"
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

    # Enable Arrow optimization in these tests.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect failures.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField("1_str_t", StringType(), True),
        StructField("2_int_t", IntegerType(), True),
        StructField("3_long_t", LongType(), True),
        StructField("4_float_t", FloatType(), True),
        StructField("5_double_t", DoubleType(), True),
        StructField("6_decimal_t", DecimalType(38, 18), True),
        StructField("7_date_t", DateType(), True),
        StructField("8_timestamp_t", TimestampType(), True),
        StructField("9_binary_t", BinaryType(), True),
    ])
    cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
    cls.data_wo_null = [
        (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"),
         date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
        (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"),
         date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
        (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"),
         date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
        (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"),
         date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
    ]
    cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
    BINARY = (BinaryType(), "blob", "BINARY")  # Cassandra stores binary data as blob
    BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")
    DATE = (DateType(), "timestamp", "DATE")
    DECIMAL = (DecimalType(), "decimal", "DECIMAL")
    DOUBLE = (DoubleType(), "double", "DOUBLE")
    FLOAT = (FloatType(), "float", "FLOAT")
    INTEGER = (IntegerType(), "int", "INT")
    BIGINT = (LongType(), "bigint", "BIGINT")
    STRING = (StringType(), "text", "STRING")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>", "ARRAY<BIGINT>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>", "ARRAY<STRING>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>", "ARRAY<FLOAT>")

    def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None:
        self.spark = spark
        self.cassandra = cassandra
        self.spark_sql = spark_sql
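# Usage sketch: each enum member bundles the Spark type object with its
# Cassandra and Spark SQL spellings, so callers can pick whichever
# representation they need from a single constant.
assert DataType.BIGINT.spark == LongType()
assert DataType.BIGINT.cassandra == "bigint"
assert DataType.BIGINT.spark_sql == "BIGINT"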
def test_BinaryType_serialization(self):
    # Pyrolite version <= 4.9 could not serialize BinaryType with Python 3 (SPARK-17808).
    # The empty bytearray is a test for SPARK-21534.
    schema = StructType([StructField('mybytes', BinaryType())])
    data = [[bytearray(b'here is my data')],
            [bytearray(b'and here is some more')],
            [bytearray(b'')]]
    df = self.spark.createDataFrame(data, schema=schema)
    df.collect()
def test_filesTODF(self):
    df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
    self.assertEqual(df.rdd.getNumPartitions(), 217)
    self.assertEqual(df.schema.fields[0].dataType, StringType())
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())
    first = df.first()
    self.assertTrue(hasattr(first, "filePath"))
    self.assertEqual(type(first.fileData), bytearray)
def heatmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) != 2:
        return None

    col_point = df.schema.names[0]
    col_count = df.schema.names[1]

    from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   StringType, IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection

    coor = vega.coor()
    bounding_box = vega.bounding_box()
    height = vega.height()
    width = vega.width()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(bounding_box[1]) + ')'

    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias(col_point),
            col(col_count))
    else:
        df = df.select(
            Projection(col(col_point), lit(bottom_right), lit(top_left),
                       lit(int(height)), lit(int(width))).alias(col_point),
            col(col_count))

    agg_schema = StructType([
        StructField(col_point, BinaryType(), True),
        StructField(col_count, IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        # Pre-aggregate each batch: sum the weights per point.
        for pdf in batch_iter:
            dd = pdf.groupby([col_point])
            dd = dd[col_count].agg(['sum']).reset_index()
            dd.columns = [col_point, col_count]
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def heatmap_wkb(point, w, conf=vega):
        from arctern import heat_map
        return heat_map(conf, point, w)

    agg_df = df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
    hex_data = agg_df.agg(heatmap_wkb(agg_df[col_point],
                                      agg_df[col_count])).collect()[0][0]
    return hex_data
def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello",
          bytearray(b"spark\x01\x02"), -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we used numPartitions > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)

    # TODO: improve the test once the batch_size argument is added.
    # For now there is only one batch.
    for i in range(converter.dataset_size):
        for col in df.schema.names:
            actual_ele = getattr(ts, col)[i]
            expected_ele = expected_df[i][col]
            if col == "str_col":
                actual_ele = actual_ele.decode()
            if col == "bin_col":
                actual_ele = bytearray(actual_ele)
            if col == "float_col" or col == "double_col":
                # Note that the default dtype is float32.
                assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
            else:
                assert expected_ele == actual_ele
    assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type  # default dtype is float32
    assert np.float32 == ts.double_col.dtype.type
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type
def get_downloaded_images_df(input_parquet, sum_accumulator):
    download_image_udf = spark.udf.register(
        "download_image",
        lambda x: download_image(x, sum_accumulator),
        BinaryType())
    downloaded_images_df = (spark.read.parquet(input_parquet)
                            .select("id", "photo_video_download_url")
                            .withColumn('image_bytes',
                                        download_image_udf('photo_video_download_url'))
                            .drop('photo_video_download_url'))
    return downloaded_images_df
def spark_streaming_to_pubsublite(project_number: int, location: str,
                                  topic_id: str) -> None:
    # [START pubsublite_spark_streaming_to_pubsublite]
    from pyspark.sql import SparkSession
    from pyspark.sql.types import BinaryType, StringType
    import uuid

    # TODO(developer):
    # project_number = 11223344556677
    # location = "us-central1-a"
    # topic_id = "your-topic-id"

    spark = SparkSession.builder.appName("write-app").master("yarn").getOrCreate()

    # Create a RateStreamSource that generates consecutive numbers with timestamps:
    # |-- timestamp: timestamp (nullable = true)
    # |-- value: long (nullable = true)
    sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

    # Transform the stream to match the required fields: key and data must be
    # binary, and event_timestamp is taken from the source timestamp.
    sdf = (sdf.withColumn("key", (sdf.value % 5).cast(StringType()).cast(BinaryType()))
           .withColumn("event_timestamp", sdf.timestamp)
           .withColumn("data", sdf.value.cast(StringType()).cast(BinaryType()))
           .drop("value", "timestamp"))

    sdf.printSchema()

    query = (sdf.writeStream.format("pubsublite")
             .option(
                 "pubsublite.topic",
                 f"projects/{project_number}/locations/{location}/topics/{topic_id}",
             )
             # Required. Use a unique checkpoint location for each job.
             .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
             .outputMode("append")
             .trigger(processingTime="1 second")
             .start())

    # Wait 60 seconds to terminate the query.
    query.awaitTermination(60)
    query.stop()
    # [END pubsublite_spark_streaming_to_pubsublite]
def _agg_func_template(df, col_name, st_agg_func):
    import pandas as pd
    from pyspark.sql.functions import pandas_udf, PandasUDFType
    from pyspark.sql.types import StructType, StructField, BinaryType

    agg_schema = StructType([StructField('geos', BinaryType(), True)])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def agg_step1(batch_iter, col_name=col_name):
        # Partial aggregation: reduce each batch to a single geometry.
        for pdf in batch_iter:
            ret = st_agg_func(pdf[col_name])
            res_df = pd.DataFrame({"geos": [ret[0]]})
            yield res_df

    @pandas_udf(BinaryType(), PandasUDFType.GROUPED_AGG)
    def agg_step2(geos):
        # Final aggregation: combine the per-batch results into one value.
        return st_agg_func(geos)[0]

    agg_df = df.mapInPandas(agg_step1)
    agg_df = agg_df.coalesce(1)
    ret = agg_df.agg(agg_step2(agg_df['geos'])).collect()[0][0]
    return ret
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: "
                            + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
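# A quick sanity sketch (not from the original source) mapping a few pyarrow
# types through the converter above; pa.binary(), pa.int32(), and pa.list_()
# are standard pyarrow type factories.
import pyarrow as pa

assert from_arrow_type(pa.binary()) == BinaryType()
assert from_arrow_type(pa.int32()) == IntegerType()
assert from_arrow_type(pa.list_(pa.string())) == ArrayType(StringType())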
def create_schema(col_schema: dict) -> StructType:
    # Map simple Python type names to Spark SQL types; fall back to StringType
    # for unrecognized names rather than passing None to StructField.
    type_mapping = {
        'str': StringType(),
        'int': IntegerType(),
        'float': FloatType(),
        'bool': BooleanType(),
        'date': DateType(),
        'bytes': BinaryType(),
    }
    schema = [StructField(col_name, type_mapping.get(field_type, StringType()), True)
              for col_name, field_type in col_schema.items()]
    return StructType(fields=schema)
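# Usage sketch for create_schema(): column names map to type-name strings.
schema = create_schema({'id': 'int', 'name': 'str', 'payload': 'bytes'})
# -> StructType([StructField('id', IntegerType(), True),
#                StructField('name', StringType(), True),
#                StructField('payload', BinaryType(), True)])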
def test_image_round_trip(self):
    # Test round trip: array -> png -> sparkImg -> array
    binarySchema = StructType([StructField("data", BinaryType(), False)])
    df = self.session.createDataFrame([[bytearray(pngData)]], binarySchema)

    # Convert to images.
    decImg = udf(imageIO._decodeImage, imageIO.imageSchema)
    imageDF = df.select(decImg("data").alias("image"))
    row = imageDF.first()

    testArray = imageIO.imageStructToArray(row.image)
    self.assertEqual(testArray.shape, array.shape)
    self.assertEqual(testArray.dtype, array.dtype)
    self.assertTrue(np.all(array == testArray))
def choroplethmap(df, vega):
    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   StringType, IntegerType)
    from ._wrapper_func import TransformAndProjection

    coor = vega.coor()
    bounding_box = vega.bounding_box()
    height = vega.height()
    width = vega.width()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(bounding_box[1]) + ')'

    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col('wkt'), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias("wkb"),
            col('w'))

    vega = vega.build()
    agg_schema = StructType([
        StructField('wkb', BinaryType(), True),
        StructField('w', IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        for pdf in batch_iter:
            dd = pdf.groupby(['wkb'])
            dd = dd['w'].agg(['sum']).reset_index()
            dd.columns = ['wkb', 'w']
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def choroplethmap_wkb(wkb, w, conf=vega):
        from arctern import choropleth_map
        return choropleth_map(wkb, w, conf.encode('utf-8'))

    @pandas_udf("double", PandasUDFType.GROUPED_AGG)
    def sum_udf(v):
        return v.sum()

    agg_df = df.where("wkb != ''")
    agg_df = agg_df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.coalesce(1)
    hex_data = agg_df.agg(choroplethmap_wkb(agg_df['wkb'],
                                            agg_df['w'])).collect()[0][0]
    return hex_data
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType([
            StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
            for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
def sqlType(cls):
    """
    Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
    """
    extent = StructType([
        StructField("xmin", DoubleType(), True),
        StructField("ymin", DoubleType(), True),
        StructField("xmax", DoubleType(), True),
        StructField("ymax", DoubleType(), True)
    ])

    grid = StructType([
        StructField("colMin", IntegerType(), True),
        StructField("rowMin", IntegerType(), True),
        StructField("colMax", IntegerType(), True),
        StructField("rowMax", IntegerType(), True)
    ])

    ref = StructType([
        StructField(
            "source",
            StructType([StructField("raster_source_kryo", BinaryType(), False)]),
            True),
        StructField("bandIndex", IntegerType(), True),
        StructField("subextent", extent, True),
        StructField("subgrid", grid, True),
    ])

    return StructType([
        StructField("cellType", StringType(), False),
        StructField("cols", IntegerType(), False),
        StructField("rows", IntegerType(), False),
        StructField("cells", BinaryType(), True),
        StructField("gridBounds", grid, True),
        StructField("ref", ref, True)
    ])
def sqlalchemy_spark_type(sqlalchemy_type: sqlalchemy.types.TypeEngine):
    if isinstance(sqlalchemy_type, sqlalchemy.types.Binary):
        return BinaryType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Boolean):
        return BooleanType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Date):
        return DateType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.DateTime):
        return TimestampType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Integer):
        return LongType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.String):
        return StringType()
    elif isinstance(sqlalchemy_type, sqlalchemy.types.Float):
        return DoubleType()
    else:
        # Fail loudly instead of silently returning None for unmapped types.
        raise TypeError("Unsupported SQLAlchemy type: {}".format(sqlalchemy_type))
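# Usage sketch (assuming sqlalchemy is imported at module level, as the
# annotation above implies): translate SQLAlchemy column types to Spark types.
assert sqlalchemy_spark_type(sqlalchemy.types.Integer()) == LongType()
assert sqlalchemy_spark_type(sqlalchemy.types.String()) == StringType()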
def test_primitives():
    assert BooleanType() == parse_schema("bool")
    assert BooleanType() == parse_schema("boolean")
    assert ByteType() == parse_schema("byte")
    assert ByteType() == parse_schema("tinyint")
    assert ShortType() == parse_schema("short")
    assert ShortType() == parse_schema("smallint")
    assert IntegerType() == parse_schema("int")
    assert FloatType() == parse_schema("float")
    assert DoubleType() == parse_schema("double")
    assert StringType() == parse_schema("string")
    assert BinaryType() == parse_schema("binary")
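# For reference (an assumption, not from the original source): if parse_schema
# mirrors Spark's DDL type parser, the same names resolve through PySpark's own
# long-standing (though private) helper as well.
from pyspark.sql.types import _parse_datatype_string

assert _parse_datatype_string("binary") == BinaryType()
assert _parse_datatype_string("smallint") == ShortType()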
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
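# Usage sketch (the path is hypothetical): load a directory of images as
# (filePath, fileData) rows, one row per file.
df = filesToDF(sc, "/data/images", numPartitions=16)
df.printSchema()
# root
#  |-- filePath: string (nullable = false)
#  |-- fileData: binary (nullable = false)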
def run_pyspark_pipeline(dir_path, spark, cores, out_path):
    """
    Reads parquet files from 'dir_path' and parses trec_car_tools.Page objects
    to create protobufs with entity linking.
    """
    print('start preprocessing')
    start_preprocess = time.time()

    # Reads parquet files from 'dir_path' - each row is a TREC CAR page.
    df_in = spark.read.parquet(dir_path)
    df_in.printSchema()
    num_partitions = df_in.rdd.getNumPartitions()
    print("Number of default partitions: {}".format(num_partitions))

    print('end preprocess')
    end_preprocess = time.time()
    print("*** preprocess time: {:.2f}s ***".format(end_preprocess - start_preprocess))

    print('start pyspark_processing job')
    start_pyspark_job = time.time()

    if num_partitions < cores * 4:
        print('repartitioning df')
        df_in = df_in.repartition(cores * 4)
        print("Number of partitions should equal 4*cores --> {}".format(
            df_in.rdd.getNumPartitions()))

    @udf(returnType=BinaryType())
    def parse_udf(page_bytearray):
        # Parses a trec_car_tools.Page object to create a protobuf with entity linking.
        page = pickle.loads(page_bytearray)
        tp = TrecCarParser()
        doc = tp.parse_page_to_protobuf(page=page)
        doc_bytearray = pickle.dumps(doc.SerializeToString())
        return doc_bytearray

    df_parse = df_in.withColumn("doc_bytearray", parse_udf("page_bytearray"))
    # Add a 0-based index column to the DataFrame.
    df_parse = df_parse.withColumn(
        "index",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    df_parse.write.parquet(out_path)

    print('end pyspark_processing job')
    end_pyspark_job = time.time()
    print("*** pyspark_processing job time: {:.2f}s ***".format(
        end_pyspark_job - start_pyspark_job))
def _map_field_type(f_type):
    struct_fields = {
        "string": StringType(),
        "binary": BinaryType(),
        "boolean": BooleanType(),
        "date": StringType(),
        "timestamp": StringType(),
        "decimal": DecimalType(),
        "double": DoubleType(),
        "float": FloatType(),
        "number": StringType(),
        "byte": ByteType(),
        "integer": IntegerType(),
        "long": LongType(),
        "short": ShortType(),
        None: StringType()
    }
    return struct_fields.get(f_type, StringType())
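# Usage sketch: note that date/timestamp names and anything unrecognized
# (including a missing type) all degrade to StringType.
assert _map_field_type("binary") == BinaryType()
assert _map_field_type("date") == StringType()     # dates are kept as strings here
assert _map_field_type(None) == StringType()       # missing type
assert _map_field_type("unknown") == StringType()  # fallback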
def test_toPandas_empty_df_arrow_enabled(self):
    # SPARK-30537: test that toPandas() on an empty DataFrame has the correct
    # dtypes when Arrow is enabled.
    from datetime import date
    from decimal import Decimal

    schema = StructType([
        StructField("a", StringType(), True),
        StructField("b", IntegerType(), True),
        StructField("c", TimestampType(), True),
        StructField("d", NullType(), True),
        StructField("e", LongType(), True),
        StructField("f", FloatType(), True),
        StructField("g", DateType(), True),
        StructField("h", BinaryType(), True),
        StructField("i", DecimalType(38, 18), True),
        StructField("k", TimestampNTZType(), True),
        StructField("L", DayTimeIntervalType(0, 3), True),
    ])
    df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema=schema)
    non_empty_df = self.spark.createDataFrame(
        [(
            "a",
            1,
            datetime.datetime(1969, 1, 1, 1, 1, 1),
            None,
            10,
            0.2,
            date(1969, 1, 1),
            bytearray(b"a"),
            Decimal("2.0"),
            datetime.datetime(1969, 1, 1, 1, 1, 1),
            datetime.timedelta(microseconds=123),
        )],
        schema=schema,
    )

    pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
    pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(non_empty_df)
    assert_frame_equal(pdf, pdf_arrow)
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
def sqlType(cls) -> StructType:
    return StructType(fields=[
        # The dtype field will use dictionary encoding.
        StructField("dtype", StringType(), False),
        StructField("shape", ArrayType(IntegerType(), False), False),
        StructField("data", BinaryType(), False),
    ])
def _infer_sql_type(k, v):
    # Special handling for binary features.
    if k in binary_features:
        return BinaryType()

    if v.int64_list.value:
        result = v.int64_list.value
        sql_type = LongType()
    elif v.float_list.value:
        result = v.float_list.value
        sql_type = DoubleType()
    else:
        result = v.bytes_list.value
        sql_type = StringType()

    if len(result) > 1:
        # Represent multi-item tensors as Spark SQL ArrayType() of base types.
        return ArrayType(sql_type)
    else:
        # Represent everything else as base types (and empty tensors as StringType()).
        return sql_type
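# Usage sketch (assuming `v` is a tf.train.Feature from a TFRecord and
# `binary_features` is a collection of column names, as the code above implies):
import tensorflow as tf

multi = tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3]))
# _infer_sql_type("counts", multi)  -> ArrayType(LongType())
single = tf.train.Feature(float_list=tf.train.FloatList(value=[0.5]))
# _infer_sql_type("score", single)  -> DoubleType()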
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp")
    BINARY = (BinaryType(), "blob")  # Cassandra stores binary data as blob
    BOOLEAN = (BooleanType(), "boolean")
    DATE = (DateType(), "timestamp")
    DECIMAL = (DecimalType(), "decimal")
    DOUBLE = (DoubleType(), "double")
    FLOAT = (FloatType(), "float")
    INTEGER = (IntegerType(), "int")
    BIGINT = (LongType(), "bigint")
    STRING = (StringType(), "text")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>")

    def __init__(self, spark, cassandra):
        self.spark = spark
        self.cassandra = cassandra
def sqlType(cls) -> StructType:
    return StructType(fields=[
        StructField("dtype", ShortType(), False),
        StructField("shape", ArrayType(IntegerType(), False), False),
        StructField("data", BinaryType(), False),
    ])
def sqlType(cls):
    """
    Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
    """
    return StructType([
        StructField(
            "cell_context",
            StructType([
                StructField(
                    "cellType",
                    StructType([StructField("cellTypeName", StringType(), False)]),
                    False),
                StructField(
                    "dimensions",
                    StructType([
                        StructField("cols", ShortType(), False),
                        StructField("rows", ShortType(), False)
                    ]),
                    False),
            ]),
            False),
        StructField(
            "cell_data",
            StructType([
                StructField("cells", BinaryType(), True),
                StructField(
                    "ref",
                    StructType([
                        StructField("source", RasterSourceUDT(), False),
                        StructField("bandIndex", IntegerType(), False),
                        StructField(
                            "subextent",
                            StructType([
                                StructField("xmin", DoubleType(), False),
                                StructField("ymin", DoubleType(), False),
                                StructField("xmax", DoubleType(), False),
                                StructField("ymax", DoubleType(), False)
                            ]),
                            True)
                    ]),
                    True)
            ]),
            False)
    ])
import os
import struct

from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType, IntegerType


def test13(spark):
    # ssrc is the synchronization source identifier. See
    # https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples5')
    df = (spark
          .readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .load())

    # Each event starts with an 8-byte chunk header: one flag byte, 3 bytes of
    # padding, then two big-endian int16 fields (chunk_index, final_chunk_index),
    # followed by the payload.
    @udf(returnType=IntegerType())
    def parse_chunk_index(event):
        unpacked = struct.unpack('!bxxxhh', event[0:8])
        return unpacked[1]

    @udf(returnType=IntegerType())
    def parse_final_chunk_index(event):
        unpacked = struct.unpack('!bxxxhh', event[0:8])
        return unpacked[2]

    @udf(returnType=BinaryType())
    def parse_payload(event):
        return event[8:]

    df = df.select('*', parse_chunk_index('event').alias('chunk_index'))
    df = df.select('*', parse_final_chunk_index('event').alias('final_chunk_index'))
    df = df.select('*', parse_payload('event').alias('payload'))
    df.printSchema()
    (df.writeStream
       .trigger(processingTime='3 seconds')  # limit trigger rate
       .outputMode('append')
       .format('console')
       # .option('truncate', 'false')
       .start()
       .awaitTermination())
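# A minimal writer-side sketch (hypothetical, not from the original source):
# frames a payload chunk so the parse_* UDFs above can decode it. The reader
# never inspects the flag byte, so 0 is used here.
def frame_chunk(payload: bytes, chunk_index: int, final_chunk_index: int) -> bytes:
    # struct.pack('!bxxxhh', ...) mirrors the struct.unpack('!bxxxhh', ...) above.
    header = struct.pack('!bxxxhh', 0, chunk_index, final_chunk_index)
    return header + payload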