def recognize(binary):
    s = io.BytesIO(binary)
    r = sr.Recognizer()
    with sr.AudioFile(s) as source:
        audio = r.record(source)
    try:
        print("Transcribing...")
        text = r.recognize_sphinx(audio)
        print("Done!")
        return text
    except Exception:
        msg = "no_transcription_available"
        print("Darn! Could not transcribe audio.")
        return msg


sttudf = udf(lambda z: recognize(z), StringType())
splitudf = udf(lambda x: splitWav(x), ArrayType(BinaryType()))
convertudf = udf(lambda x: convertToWav(x), BinaryType())

df = spark.read.format("binaryFile") \
    .option("pathGlobFilter", "DTNS*.mp3") \
    .option("recursiveFileLookup", "true") \
    .load("s3a://jordan-podcast-s3/")
df = df.withColumn("WAVAudio", convertudf(df.content)).drop("modificationTime", "length", "content")
df = df.withColumn("splitwavs", splitudf(df.WAVAudio)).drop("WAVAudio")
df = df.withColumn("splitwavs", explode(df.splitwavs))
df = df.repartition(36)
df = df.withColumn("transcriptions", sttudf(df.splitwavs)).drop("splitwavs")
df = df.groupby("path").agg(collect_list('transcriptions').alias("transcriptions"))
df = df.withColumn("transcriptions", concat_ws(" ", "transcriptions"))

df.write.format('org.elasticsearch.spark.sql')\
    .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
    .option('es.port', 9200)\
    .option('es.resource', "podcast2/test")\
    .save()
def sqlType(cls):
    return StructType([StructField("wkb", BinaryType(), True)])
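# A minimal sketch (not from the original source) of the UserDefinedType subclass that a
# "wkb" sqlType like the one above typically belongs to: the geometry is stored as WKB
# bytes in a single BinaryType field. The class name, module name, and the assumption
# that the wrapped object exposes a .wkb attribute are all hypothetical.
from pyspark.sql.types import UserDefinedType, StructType, StructField, BinaryType


class GeometryUDT(UserDefinedType):
    @classmethod
    def sqlType(cls):
        # Underlying SQL representation: one nullable binary column holding WKB bytes.
        return StructType([StructField("wkb", BinaryType(), True)])

    @classmethod
    def module(cls):
        return "geometry_udt"  # hypothetical module name

    def serialize(self, obj):
        # Pack the object's WKB bytes into the single-field struct layout above.
        return (bytearray(obj.wkb),)

    def deserialize(self, datum):
        # datum is the (wkb,) row produced by serialize(); hand back the raw bytes.
        return bytes(datum[0])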
def prepare_vad_udf(num_padding_frames, threshold, aggressiveness, frame_duration_ms):
    # Each audio file returns multiple voiced fragments. I need an Array, don't I?
    return_type = StructType(
        [
            StructField("start_ms", ArrayType(IntegerType())),
            StructField("end_ms", ArrayType(IntegerType())),
            StructField("voiced_buffer", ArrayType(BinaryType())),
        ]
    )
    # Try using ArrayType(BinaryType()). Need to convert numpy array to bytearray
    # Need a java UDF to reinterpret bytes, it seems https://stackoverflow.com/a/57848517
    # Or I could just use np.ndarray.view(np.int8) right here.
    AUDIO_FORMAT = AudioFormat(sample_rate=16_000, channels=1, sample_byte_width=2)
    FRAME_DURATION_SAMPLES = (AUDIO_FORMAT.sample_rate * frame_duration_ms) // 1000
    FRAME_DURATION_BYTES = (
        FRAME_DURATION_SAMPLES * AUDIO_FORMAT.channels * AUDIO_FORMAT.sample_byte_width
    )

    @pandas_udf(return_type)
    def vad(
        audio_series: pd.Series,
        audio_types_series: pd.Series,
        audio_document_id_series: pd.Series,
    ) -> pd.DataFrame:
        df_rows = []
        for audio_buffer, audio_type, audio_document_id in zip(
            audio_series, audio_types_series, audio_document_id_series
        ):
            wav_bytes_buffer = BytesIO(DecodeToWavPipe(audio_buffer, audio_type))
            with wave.open(wav_bytes_buffer, "rb") as fh:
                num_frames = fh.getnframes()
                assert fh.getframerate() == AUDIO_FORMAT.sample_rate
                assert fh.getnchannels() == AUDIO_FORMAT.channels
                assert fh.getsampwidth() == AUDIO_FORMAT.sample_byte_width
                pcm_buffer = fh.readframes(num_frames)
            del wav_bytes_buffer

            num_frames = len(pcm_buffer) // FRAME_DURATION_BYTES
            buffers = [
                pcm_buffer[FRAME_DURATION_BYTES * i : FRAME_DURATION_BYTES * (i + 1)]
                for i in range(num_frames)
            ]
            del pcm_buffer

            generator = vad_split(
                buffers, AUDIO_FORMAT, num_padding_frames, threshold, aggressiveness
            )

            voiced_buffer_list, start_ms_list, end_ms_list = [], [], []
            total_serialized_bytes = 0
            for voiced_buffer, start_ms, end_ms in generator:
                total_serialized_bytes += 2 * len(voiced_buffer)
                if (
                    total_serialized_bytes
                    > 2 * 1024 * 1024 * 1024 - 1024 * 1024 * 1024
                ):
                    two_sum = lambda x, y: (sum(x), sum(y))
                    ignored_bytes = 0
                    ignored_ms = 0.0
                    for voiced_buffer, start_ms, end_ms in generator:
                        ignored_bytes += len(voiced_buffer)
                        ignored_ms += end_ms - start_ms
                    ignored_gigabytes = ((ignored_bytes / 1024) / 1024) / 1024
                    ignored_hours = ((ignored_ms / 1000) / 60) / 60
                    print(
                        f"WARNING: truncating voice-activity-detected audio to less than 2GB for {audio_document_id}. "
                        f"Wasted {ignored_gigabytes}GB of data. Wasted {ignored_hours} hours of data."
                    )
                    break
                voiced_buffer_list.append(voiced_buffer)
                start_ms_list.append(start_ms)
                end_ms_list.append(end_ms)
            del buffers

            # mb_total = sum(voiced_buffer.nbytes / 1024 / 1024 for voiced_buffer in voiced_buffer_list)
            # print("GALVEZ: Chunk size in MB: ", mb_total)
            df_rows.append(
                {
                    "start_ms": start_ms_list,
                    "end_ms": end_ms_list,
                    "voiced_buffer": voiced_buffer_list,
                }
            )
        return pd.DataFrame(df_rows)

    return vad
def test_as_spark_type_koalas_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(koalas_dtype(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        koalas_dtype(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        koalas_dtype(np.dtype("object"))
lz4_clevel = 1


# this is a UDF that takes care of summing histograms across
# various spark results where the outputs are histogram blobs
def agg_histos_raw(series, processor_instance, lz4_clevel):
    goodlines = series[series.str.len() > 0]
    if goodlines.size == 1:  # short-circuit trivial aggregations
        return goodlines[0]
    outhist = processor_instance.accumulator.identity()
    for line in goodlines:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)


@fn.pandas_udf(BinaryType(), fn.PandasUDFType.GROUPED_AGG)
def agg_histos(series):
    global processor_instance, lz4_clevel
    return agg_histos_raw(series, processor_instance, lz4_clevel)


def reduce_histos_raw(df, processor_instance, lz4_clevel):
    histos = df['histos']
    mask = (histos.str.len() > 0)
    outhist = processor_instance.accumulator.identity()
    for line in histos[mask]:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return pd.DataFrame(
        data={'histos': np.array([lz4f.compress(pkl.dumps(outhist),
                                                compression_level=lz4_clevel)], dtype='O')})


@fn.pandas_udf(StructType([StructField('histos', BinaryType(), True)]), fn.PandasUDFType.GROUPED_MAP)
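# Hedged usage sketch (not from the original source): how a GROUPED_AGG pandas UDF like
# agg_histos above is typically applied. The 'dataset' grouping column and the 'histos'
# blob column names are assumptions for illustration only.
summed = (
    df.groupBy('dataset')
      .agg(agg_histos('histos').alias('histos'))
)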
def test_verify_type_not_nullable(self):
    import array
    import datetime
    import decimal

    schema = StructType([
        StructField('s', StringType(), nullable=False),
        StructField('i', IntegerType(), nullable=True)])

    class MyObj:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # obj, data_type
    success_spec = [
        # String
        ("", StringType()),
        (u"", StringType()),
        (1, StringType()),
        (1.0, StringType()),
        ([], StringType()),
        ({}, StringType()),
        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),
        # Boolean
        (True, BooleanType()),
        # Byte
        (-(2**7), ByteType()),
        (2**7 - 1, ByteType()),
        # Short
        (-(2**15), ShortType()),
        (2**15 - 1, ShortType()),
        # Integer
        (-(2**31), IntegerType()),
        (2**31 - 1, IntegerType()),
        # Long
        (-(2**63), LongType()),
        (2**63 - 1, LongType()),
        # Float & Double
        (1.0, FloatType()),
        (1.0, DoubleType()),
        # Decimal
        (decimal.Decimal("1.0"), DecimalType()),
        # Binary
        (bytearray([1, 2]), BinaryType()),
        # Date/Timestamp
        (datetime.date(2000, 1, 2), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),
        # Array
        ([], ArrayType(IntegerType())),
        (["1", None], ArrayType(StringType(), containsNull=True)),
        ([1, 2], ArrayType(IntegerType())),
        ((1, 2), ArrayType(IntegerType())),
        (array.array('h', [1, 2]), ArrayType(IntegerType())),
        # Map
        ({}, MapType(StringType(), IntegerType())),
        ({"a": 1}, MapType(StringType(), IntegerType())),
        ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=True)),
        # Struct
        ({"s": "a", "i": 1}, schema),
        ({"s": "a", "i": None}, schema),
        ({"s": "a"}, schema),
        ({"s": "a", "f": 1.0}, schema),
        (Row(s="a", i=1), schema),
        (Row(s="a", i=None), schema),
        (["a", 1], schema),
        (["a", None], schema),
        (("a", 1), schema),
        (MyObj(s="a", i=1), schema),
        (MyObj(s="a", i=None), schema),
        (MyObj(s="a"), schema),
    ]

    # obj, data_type, exception class
    failure_spec = [
        # String (match anything but None)
        (None, StringType(), ValueError),
        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),
        # Boolean
        (1, BooleanType(), TypeError),
        ("True", BooleanType(), TypeError),
        ([1], BooleanType(), TypeError),
        # Byte
        (-(2**7) - 1, ByteType(), ValueError),
        (2**7, ByteType(), ValueError),
        ("1", ByteType(), TypeError),
        (1.0, ByteType(), TypeError),
        # Short
        (-(2**15) - 1, ShortType(), ValueError),
        (2**15, ShortType(), ValueError),
        # Integer
        (-(2**31) - 1, IntegerType(), ValueError),
        (2**31, IntegerType(), ValueError),
        # Float & Double
        (1, FloatType(), TypeError),
        (1, DoubleType(), TypeError),
        # Decimal
        (1.0, DecimalType(), TypeError),
        (1, DecimalType(), TypeError),
        ("1.0", DecimalType(), TypeError),
        # Binary
        (1, BinaryType(), TypeError),
        # Date/Timestamp
        ("2000-01-02", DateType(), TypeError),
        (946811040, TimestampType(), TypeError),
        # Array
        (["1", None], ArrayType(StringType(), containsNull=False), ValueError),
        ([1, "2"], ArrayType(IntegerType()), TypeError),
        # Map
        ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError),
        ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError),
        ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=False), ValueError),
        # Struct
        ({"s": "a", "i": "1"}, schema, TypeError),
        (Row(s="a"), schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), schema, TypeError),
        (["a"], schema, ValueError),
        (["a", "1"], schema, TypeError),
        (MyObj(s="a", i="1"), schema, TypeError),
        (MyObj(s=None, i="1"), schema, ValueError),
    ]

    # Check success cases
    for obj, data_type in success_spec:
        try:
            _make_type_verifier(data_type, nullable=False)(obj)
        except Exception:
            self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type))

    # Check failure cases
    for obj, data_type, exp in failure_spec:
        msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp)
        with self.assertRaises(exp, msg=msg):
            _make_type_verifier(data_type, nullable=False)(obj)
def weighted_pointmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) == 1:
        col_point = df.schema.names[0]
        render_mode = 0
    elif len(df.schema.names) == 2:
        col_point = df.schema.names[0]
        col_count = df.schema.names[1]
        render_mode = 1
    elif len(df.schema.names) == 3:
        col_point = df.schema.names[0]
        col_color = df.schema.names[1]
        col_stroke = df.schema.names[2]
        render_mode = 2
    else:
        return None

    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType, IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection

    bounding_box = vega.bounding_box()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(bounding_box[1]) + ')'
    height = vega.height()
    width = vega.width()
    coor = vega.coor()
    aggregation_type = vega.aggregation_type()

    if coor == 'EPSG:3857':
        if render_mode == 2:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point),
                col(col_color), col(col_stroke))
            agg_schema = StructType([
                StructField(col_point, BinaryType(), True),
                StructField(col_color, IntegerType(), True),
                StructField(col_stroke, IntegerType(), True)
            ])

            @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
            def render_agg_UDF_3857_2(batch_iter):
                for pdf in batch_iter:
                    dd = pdf.groupby([col_point])
                    ll = [col_color, col_stroke]
                    dd = dd[ll].agg([aggregation_type]).reset_index()
                    dd.columns = [col_point, col_color, col_stroke]
                    yield dd

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb_3857_2(point, c, s, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf, point, False, color_weights=c, size_weights=s)

            agg_df = df.mapInPandas(render_agg_UDF_3857_2)
            agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = agg_df.agg(
                weighted_pointmap_wkb_3857_2(
                    agg_df[col_point], agg_df[col_color], agg_df[col_stroke])).collect()[0][0]
        elif render_mode == 1:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point),
                col(col_count))
            agg_schema = StructType([
                StructField(col_point, BinaryType(), True),
                StructField(col_count, IntegerType(), True)
            ])

            @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
            def render_agg_UDF_3857_1(batch_iter):
                for pdf in batch_iter:
                    dd = pdf.groupby([col_point])
                    dd = dd[col_count].agg([aggregation_type]).reset_index()
                    dd.columns = [col_point, col_count]
                    yield dd

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb_3857_1(point, c, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf, point, False, color_weights=c)

            agg_df = df.mapInPandas(render_agg_UDF_3857_1)
            agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = agg_df.agg(
                weighted_pointmap_wkb_3857_1(
                    agg_df[col_point], agg_df[col_count])).collect()[0][0]
        else:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point))

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb(point, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf, point, False)

            df = df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = df.agg(weighted_pointmap_wkb(df[col_point])).collect()[0][0]
        return hex_data

    if render_mode == 2:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias(col_point),
            col(col_color), col(col_stroke))
        agg_schema = StructType([
            StructField(col_point, BinaryType(), True),
            StructField(col_color, IntegerType(), True),
            StructField(col_stroke, IntegerType(), True)
        ])

        @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
        def render_agg_UDF_2(batch_iter):
            for pdf in batch_iter:
                dd = pdf.groupby([col_point])
                ll = [col_color, col_stroke]
                dd = dd[ll].agg([aggregation_type]).reset_index()
                dd.columns = [col_point, col_color, col_stroke]
                yield dd

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_2(point, c, s, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf, point, False, color_weights=c, size_weights=s)

        agg_df = df.mapInPandas(render_agg_UDF_2)
        agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = agg_df.agg(
            weighted_pointmap_wkb_2(agg_df[col_point], agg_df[col_color],
                                    agg_df[col_stroke])).collect()[0][0]
    elif render_mode == 1:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias(col_point),
            col(col_count))
        agg_schema = StructType([
            StructField(col_point, BinaryType(), True),
            StructField(col_count, IntegerType(), True)
        ])

        @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
        def render_agg_UDF_1(batch_iter):
            for pdf in batch_iter:
                dd = pdf.groupby([col_point])
                dd = dd[col_count].agg([aggregation_type]).reset_index()
                dd.columns = [col_point, col_count]
                yield dd

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_1(point, c, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf, point, False, color_weights=c)

        agg_df = df.mapInPandas(render_agg_UDF_1)
        agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = agg_df.agg(
            weighted_pointmap_wkb_1(agg_df[col_point], agg_df[col_count])).collect()[0][0]
    else:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias(col_point))

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_0(point, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf, point, False)

        df = df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = df.agg(weighted_pointmap_wkb_0(df[col_point])).collect()[0][0]

    return hex_data
def spark_dtype(self):
    return BinaryType()
    pdf['width'] = 128
    pdf['nChannels'] = 4
    pdf['mode'] = 24
    pdf['data'] = np.asarray(b'this is binary data')
    yield pdf


schema = StructType([
    StructField('path', StringType(), True),       # input
    StructField('slice', IntegerType(), True),     # intermediate
    StructField('xtile', IntegerType(), True),     # intermediate
    StructField('ytile', IntegerType(), True),     # intermediate
    StructField('origin', StringType(), True),     # output
    StructField('height', IntegerType(), False),
    StructField('width', IntegerType(), False),
    StructField('nChannels', IntegerType(), False),
    StructField('mode', IntegerType(), False),
    StructField('data', BinaryType(), False)
])

# Integration test
df = spark.range(4, numPartitions=10).withColumn(
    'path', expr("concat('s3a://this/is/my/object/path_czi', string(id), '.czi')"))

# DataFrame.mapInPandas returns zero, one or more rows for every input
dfx = df.mapInPandas(pandas_czi_splitter, schema=schema)
dfx.count()
display(dfx)
def choroplethmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) != 2:
        return None

    col_polygon = df.schema.names[0]
    col_count = df.schema.names[1]

    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType, IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection

    bounding_box = vega.bounding_box()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(bounding_box[1]) + ')'
    height = vega.height()
    width = vega.width()
    coor = vega.coor()
    aggregation_type = vega.aggregation_type()

    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col(col_polygon), lit(str(coor)), lit('EPSG:3857'),
                                   lit(bottom_right), lit(top_left),
                                   lit(int(height)), lit(int(width))).alias(col_polygon),
            col(col_count))
    else:
        df = df.select(
            Projection(col(col_polygon), lit(bottom_right), lit(top_left),
                       lit(int(height)), lit(int(width))).alias(col_polygon),
            col(col_count))

    agg_schema = StructType([
        StructField(col_polygon, BinaryType(), True),
        StructField(col_count, IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        for pdf in batch_iter:
            dd = pdf.groupby([col_polygon])
            dd = dd[col_count].agg([aggregation_type]).reset_index()
            dd.columns = [col_polygon, col_count]
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def choroplethmap_wkb(wkb, w, conf=vega):
        from arctern import choropleth_map_layer
        return choropleth_map_layer(conf, wkb, w, False)

    agg_df = df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
    hex_data = agg_df.agg(
        choroplethmap_wkb(agg_df[col_polygon], agg_df[col_count])).collect()[0][0]
    return hex_data
#arr = rdd.take(1)[0]
#
#Image.open(BytesIO(arr))

# COMMAND ----------

from PIL import Image
from io import BytesIO
from pyspark.sql.types import BinaryType, StructType, StructField
from functools import partial

rdd = fin.flatMap(
    partial(msg_map, func=lambda r: r.data,
            conn=conn_d['/center_camera/image_color/compressed'])
)
rddTuple = rdd.map(lambda x: (bytearray(x),))
schema = StructType([StructField('rawdata', BinaryType(), False)])
df = rddTuple.toDF(schema)
df.cache()

# COMMAND ----------

from sparkdl.image.imageIO import PIL_decode, imageArrayToStruct
from pyspark.sql.functions import col, udf
from pyspark.ml.image import ImageSchema

imageUdf = udf(lambda b: imageArrayToStruct(PIL_decode(b)),
               ImageSchema.imageSchema['image'].dataType)
img = df.withColumn('image', imageUdf(col('rawdata')))
display(img.select('image'))
# COMMAND ----------

# Start Timing
start_download = timer()

# COMMAND ----------

# DBTITLE 1,Whole Download Pipeline
from pyspark.sql.types import StructField, BinaryType, StructType, StringType, IntegerType

# Define Schema for output table
schema = StructType(fields=[
    StructField('id', StringType(), True),
    StructField('img_binary', BinaryType(), True),
    StructField('img_size', IntegerType(), True),
    StructField('img_width', IntegerType(), True),
    StructField('img_height', IntegerType(), True),
    StructField('error_code', StringType(), True),
])

df_dl_links = spark.read.parquet(INPUT_FILE)
print("About to process {0} rows".format(df_dl_links.count()))

# Sort the dataframe along id
df_dl_links = df_dl_links.sort("id")

# Load the previously downloaded images (if job restarted)
if os.path.isdir("/dbfs" + OUTPUT_FILE):
    download_history_df = spark.read.parquet(OUTPUT_FILE)
else:
_SPARK_TYPE_MAPPING = {
    "bool": BooleanType(),
    "boolean": BooleanType(),
    "byte": ByteType(),
    "tinyint": ByteType(),
    "short": ShortType(),
    "smallint": ShortType(),
    "int": IntegerType(),
    "long": LongType(),
    "bigint": LongType(),
    "float": FloatType(),
    "double": DoubleType(),
    "str": StringType(),
    "string": StringType(),
    "binary": BinaryType(),
}


class SchemaError(Exception):
    def __init__(self, message: str):
        self.message = message


class SchemaBuilder(RikaiModelSchemaVisitor):
    def visitStructType(
            self, ctx: RikaiModelSchemaParser.StructTypeContext) -> StructType:
        return StructType(
            [self.visitStructField(field) for field in ctx.field()])

    def visitStructField(
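# Hedged illustration (not part of the original source): the mapping above resolves a
# schema type name string to a concrete Spark type instance, e.g.
assert _SPARK_TYPE_MAPPING["binary"] == BinaryType()
assert _SPARK_TYPE_MAPPING["bigint"] == LongType()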
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [
                StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
                for field in at
            ]
        )
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
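# Hedged usage sketch (not from the original source, assumes pyarrow is installed):
# a few pyarrow types and the Spark types that from_arrow_type above maps them to.
import pyarrow as pa

assert from_arrow_type(pa.binary()) == BinaryType()
assert from_arrow_type(pa.int32()) == IntegerType()
assert from_arrow_type(pa.list_(pa.binary())) == ArrayType(BinaryType())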
def spark_streaming_to_pubsublite(
    project_number: int, location: str, topic_id: str
) -> None:
    # [START pubsublite_spark_streaming_to_pubsublite]
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array, create_map, col, lit, when
    from pyspark.sql.types import BinaryType, StringType
    import uuid

    # TODO(developer):
    # project_number = 11223344556677
    # location = "us-central1-a"
    # topic_id = "your-topic-id"

    spark = SparkSession.builder.appName("write-app").getOrCreate()

    # Create a RateStreamSource that generates consecutive numbers with timestamps:
    # |-- timestamp: timestamp (nullable = true)
    # |-- value: long (nullable = true)
    sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

    # Transform the dataframe to match the required data fields and data types:
    # https://github.com/googleapis/java-pubsublite-spark#data-schema
    sdf = (
        sdf.withColumn("key", lit("example").cast(BinaryType()))
        .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
        .withColumnRenamed("timestamp", "event_timestamp")
        # Populate the attributes field. For example, an even value will
        # have {"key1", [b"even"]}.
        .withColumn(
            "attributes",
            create_map(
                lit("key1"),
                array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
            ),
        )
        .drop("value")
    )

    # After the transformation, the schema of the dataframe should look like:
    # |-- key: binary (nullable = false)
    # |-- data: binary (nullable = true)
    # |-- event_timestamp: timestamp (nullable = true)
    # |-- attributes: map (nullable = false)
    # |    |-- key: string
    # |    |-- value: array (valueContainsNull = false)
    # |    |    |-- element: binary (containsNull = false)
    sdf.printSchema()

    query = (
        sdf.writeStream.format("pubsublite")
        .option(
            "pubsublite.topic",
            f"projects/{project_number}/locations/{location}/topics/{topic_id}",
        )
        # Required. Use a unique checkpoint location for each job.
        .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
        .outputMode("append")
        .trigger(processingTime="1 second")
        .start()
    )

    # Wait 60 seconds to terminate the query.
    query.awaitTermination(60)
    query.stop()
def sqlType(cls):
    return StructType(
        [StructField("raster_source_kryo", BinaryType(), False)])
def test_as_spark_type(self):
    type_mapper = {
        # binary
        np.character: BinaryType(),
        np.bytes_: BinaryType(),
        np.string_: BinaryType(),
        bytes: BinaryType(),
        # integer
        np.int8: ByteType(),
        np.byte: ByteType(),
        np.int16: ShortType(),
        np.int32: IntegerType(),
        np.int64: LongType(),
        np.int: LongType(),
        int: LongType(),
        # floating
        np.float32: FloatType(),
        np.float: DoubleType(),
        np.float64: DoubleType(),
        float: DoubleType(),
        # string
        np.str: StringType(),
        np.unicode_: StringType(),
        str: StringType(),
        # bool
        np.bool: BooleanType(),
        bool: BooleanType(),
        # datetime
        np.datetime64: TimestampType(),
        datetime.datetime: TimestampType(),
        # DateType
        datetime.date: DateType(),
        # DecimalType
        decimal.Decimal: DecimalType(38, 18),
        # ArrayType
        np.ndarray: ArrayType(StringType()),
        List[bytes]: ArrayType(BinaryType()),
        List[np.character]: ArrayType(BinaryType()),
        List[np.bytes_]: ArrayType(BinaryType()),
        List[np.string_]: ArrayType(BinaryType()),
        List[bool]: ArrayType(BooleanType()),
        List[np.bool]: ArrayType(BooleanType()),
        List[datetime.date]: ArrayType(DateType()),
        List[np.int8]: ArrayType(ByteType()),
        List[np.byte]: ArrayType(ByteType()),
        List[decimal.Decimal]: ArrayType(DecimalType(38, 18)),
        List[float]: ArrayType(DoubleType()),
        List[np.float]: ArrayType(DoubleType()),
        List[np.float64]: ArrayType(DoubleType()),
        List[np.float32]: ArrayType(FloatType()),
        List[np.int32]: ArrayType(IntegerType()),
        List[int]: ArrayType(LongType()),
        List[np.int]: ArrayType(LongType()),
        List[np.int64]: ArrayType(LongType()),
        List[np.int16]: ArrayType(ShortType()),
        List[str]: ArrayType(StringType()),
        List[np.unicode_]: ArrayType(StringType()),
        List[datetime.datetime]: ArrayType(TimestampType()),
        List[np.datetime64]: ArrayType(TimestampType()),
    }

    for numpy_or_python_type, spark_type in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))
__null_type: NullType = NullType()
_NULL_TYPE: str = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type: BooleanType = BooleanType()
_BOOL_TYPE: str = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type: StringType = StringType()
_STR_TYPE: str = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type: BinaryType = BinaryType()
_BINARY_TYPE: str = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()

__byte_type: ByteType = ByteType()
_TINYINT_TYPE: str = __byte_type.simpleString()

__short_type: ShortType = ShortType()
_SMALLINT_TYPE: str = __short_type.simpleString()

__int_type: IntegerType = IntegerType()
_INT_TYPE: str = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)
def sqlType(self):
    return StructField("wkb", BinaryType(), False)
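# A hedged aside (not from the original source): Spark's UserDefinedType.sqlType is
# conventionally expected to return a DataType rather than a StructField. If the
# single-WKB-field layout above is what is intended, the conventional form would be:
def sqlType(self):
    return StructType([StructField("wkb", BinaryType(), False)])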
"array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_COLUMN_TYPES = { "categorical", "numeric", "date", "null", "array", "binary" } SPARK_DTYPES_TO_PROFILER = { "int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"], "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary", "array": "array",
                                _array_type_mappings, _acceptable_types)

__null_type = NullType()
_NULL_TYPE = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type = BooleanType()
_BOOL_TYPE = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type = StringType()
_STR_TYPE = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type = BinaryType()
_BINARY_TYPE = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()

__byte_type = ByteType()
_TINYINT_TYPE = __byte_type.simpleString()

__short_type = ShortType()
_SMALLINT_TYPE = __short_type.simpleString()

__int_type = IntegerType()
_INT_TYPE = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)

__long_type = LongType()
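# A hedged aside (not part of the original module) on why the tinyint/smallint constants
# above carry no typeName() assert: for those types simpleString() and typeName() differ,
# while for binary they coincide.
assert ByteType().typeName() == "byte" and ByteType().simpleString() == "tinyint"
assert ShortType().typeName() == "short" and ShortType().simpleString() == "smallint"
assert BinaryType().typeName() == "binary" and BinaryType().simpleString() == "binary"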
                # WARNING: Assumes that your return type default constructor returns a
                # "reasonable" value. May return None instead?
                duration = python_return_type()
            except subprocess.TimeoutExpired:
                print(f"Restarting on {audio_file}")
                # Call again. Sometimes gcsfuse just stalls, so we need restartability
                return get_soxi_info_udf(audio_file_series)
            durations.append(duration)
        return pd.Series(durations)

    return get_soxi_info_udf


get_audio_seconds_udf = _prepare_soxi_udf("-D", DoubleType(), float)
get_audio_sample_rate_udf = _prepare_soxi_udf("-r", StringType(), str)
get_audio_annotations_udf = _prepare_soxi_udf("-a", BinaryType(), bytes)

# Can I return an array type of struct types?
AUDIO_SEGMENTS_RETURN_TYPE = T.StructType(
    [
        T.StructField("audio_name", T.ArrayType(T.StringType())),
        T.StructField("audio", T.ArrayType(T.BinaryType())),
    ]
)


@F.pandas_udf(AUDIO_SEGMENTS_RETURN_TYPE)
def create_audio_segments_udf(
    audio_bytes_series: pd.Series,
    audio_type_series: pd.Series,
    audio_name_series: pd.Series,
def test_as_spark_type_pandas_on_spark_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type))

        if isinstance(numpy_or_python_type, CategoricalDtype):
            # Nested CategoricalDtype is not yet supported.
            continue

        self.assertEqual(as_spark_type(List[numpy_or_python_type]), ArrayType(spark_type))
        self.assertEqual(
            pandas_on_spark_type(List[numpy_or_python_type]),
            (np.dtype("object"), ArrayType(spark_type)),
        )

        # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
        if sys.version_info >= (3, 8) and LooseVersion(
                np.__version__) >= LooseVersion("1.21"):
            import numpy.typing as ntp

            self.assertEqual(
                as_spark_type(ntp.NDArray[numpy_or_python_type]), ArrayType(spark_type))
            self.assertEqual(
                pandas_on_spark_type(ntp.NDArray[numpy_or_python_type]),
                (np.dtype("object"), ArrayType(spark_type)),
            )

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        pandas_on_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        pandas_on_spark_type(np.dtype("object"))
def setUpClass(cls):
    from datetime import date, datetime
    from decimal import Decimal

    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize default timezone between Python and Java
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()

    cls.spark.conf.set("spark.sql.session.timeZone", tz)

    # Test fallback
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "false"
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "true"

    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

    # Enable Arrow optimization in this tests.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect the failures.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField("1_str_t", StringType(), True),
        StructField("2_int_t", IntegerType(), True),
        StructField("3_long_t", LongType(), True),
        StructField("4_float_t", FloatType(), True),
        StructField("5_double_t", DoubleType(), True),
        StructField("6_decimal_t", DecimalType(38, 18), True),
        StructField("7_date_t", DateType(), True),
        StructField("8_timestamp_t", TimestampType(), True),
        StructField("9_binary_t", BinaryType(), True),
    ])
    cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
    cls.data_wo_null = [
        ("a", 1, 10, 0.2, 2.0, Decimal("2.0"),
         date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
        ("b", 2, 20, 0.4, 4.0, Decimal("4.0"),
         date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
        ("c", 3, 30, 0.8, 6.0, Decimal("6.0"),
         date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
        ("d", 4, 40, 1.0, 8.0, Decimal("8.0"),
         date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
    ]
    cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, BinaryType

field = [
    StructField("record_id", LongType(), True),
    StructField("op", IntegerType(), True),
    StructField("conn", IntegerType(), True),
    StructField("time", LongType(), True),
    StructField("topic", StringType(), True),
    StructField("dtype", StringType(), True),
    StructField("header", StringType(), True),
    StructField(
        "data",
        StructType([
            StructField('message_definition', StringType(), True),
            StructField('md5sum', StringType(), True),
            StructField('msg_raw', BinaryType(), True),
        ]))
]

dfSchema = StructType(field)
sorted_fields = sorted(dfSchema.fields, key=lambda x: x.name)
sorted_schema = StructType(fields=sorted_fields)
df_records = sqlContext.createDataFrame(sc.emptyRDD(), sorted_schema)

# COMMAND ----------

from pyspark.sql import Row


def convert_to_row(rid, opid, connid, dheader, ddata):
def test_supported_types(self):
    values = [
        1, 2, 3, 4, 5, 1.1, 2.2, Decimal(1.123),
        [1, 2, 2], True, 'hello', bytearray([0x01, 0x02])
    ]
    output_fields = [
        ('id', IntegerType()), ('byte', ByteType()), ('short', ShortType()),
        ('int', IntegerType()), ('long', LongType()), ('float', FloatType()),
        ('double', DoubleType()), ('decim', DecimalType(10, 3)),
        ('array', ArrayType(IntegerType())), ('bool', BooleanType()),
        ('str', StringType()), ('bin', BinaryType())
    ]
    output_schema = StructType([StructField(*x) for x in output_fields])
    df = self.spark.createDataFrame([values], schema=output_schema)

    # Different forms of group map pandas UDF, results of these are the same
    udf1 = pandas_udf(
        lambda pdf: pdf.assign(
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + 'there',
            array=pdf.array,
            bin=pdf.bin),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    udf2 = pandas_udf(
        lambda _, pdf: pdf.assign(
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + 'there',
            array=pdf.array,
            bin=pdf.bin),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    udf3 = pandas_udf(
        lambda key, pdf: pdf.assign(
            id=key[0],
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + 'there',
            array=pdf.array,
            bin=pdf.bin),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
    expected1 = df.toPandas().groupby('id').apply(udf1.func).reset_index(drop=True)

    result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
    expected2 = expected1

    result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
    expected3 = expected1

    assert_frame_equal(expected1, result1)
    assert_frame_equal(expected2, result2)
    assert_frame_equal(expected3, result3)
train_df.show()
test_df.show()

# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensures that each of the partitions has a small size.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)

imageSchema = StructType([
    StructField("origin", StringType(), True),
    StructField("height", IntegerType(), False),
    StructField("width", IntegerType(), False),
    StructField("nChannels", IntegerType(), False),
    StructField("mode", IntegerType(), False),
    StructField("data", BinaryType(), False)
])

schema = StructType([
    StructField("image", imageSchema),
    StructField("label", IntegerType(), False)
])

image_df = (train_df.rdd.map(create_image_dataframe).toDF(schema))
image_df.show()
image_df.printSchema()
image_df.select("image.*").show()
image_df.select("image.data").show()