def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support element types other than string
        return types.ArrayType(types.StringType())
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
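# Illustrative calls for the function above, assuming the tinsel helpers
# (long, short, byte, BoundDecimal) and pyspark.sql.types as t are imported
# as in the snippet, plus `from typing import Optional`:
infer_spark_type(bytes)          # -> t.BinaryType()
infer_spark_type(Decimal)        # -> t.DecimalType(precision=36, scale=6)
infer_spark_type(Optional[str])  # has __origin__, so it is delegated to
                                 # infer_complex_spark_type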
def saveParquetDataInProjectFolder(dataframe, project_id, qualifier=1):
    import os

    # Check whether the dataframe contains image struct columns.
    images = list(
        filter(
            lambda x: x[1] == 'struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>',
            dataframe.dtypes))
    if len(images) > 0:
        for image in images:
            from pyspark.sql import types as T
            from pyspark.sql import functions as F

            folder = getProjectDataDir(project_id) + "/" + str(
                qualifier) + "." + image[0]
            os.makedirs(folder, exist_ok=True)

            def setpath(copyto):
                def copyImageAndTrimName(input):
                    originSplitted = input['origin'].split("/")
                    print(copyto)
                    newOrigin = originSplitted[len(originSplitted) - 1]
                    from shutil import copyfile
                    # Skip the first 8 characters of the origin ("file:///").
                    copyfile(input['origin'][8:], copyto + "/" + newOrigin)
                    return {
                        "origin": newOrigin,
                        "height": input['height'],
                        "width": input['width'],
                        "nChannels": input['nChannels'],
                        "mode": input['mode'],
                        "data": input['data']
                    }

                return copyImageAndTrimName

            myFunction = setpath(folder)
            copyImageAndTrimNameUDF = F.udf(
                myFunction,
                T.StructType([
                    T.StructField("origin", T.StringType(), True),
                    T.StructField("height", T.IntegerType(), True),
                    T.StructField("width", T.IntegerType(), True),
                    T.StructField("nChannels", T.IntegerType(), True),
                    T.StructField("mode", T.IntegerType(), True),
                    T.StructField("data", T.BinaryType(), True)
                ]))
            dataframe = dataframe.withColumn(
                image[0], copyImageAndTrimNameUDF(image[0]))

    dataframe.write.mode("overwrite").parquet(
        getProjectDataDir(project_id) + "/" + str(qualifier) + ".parquet")
def hll_merge(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.GROUPED_AGG)
    def _hll_merge(v):
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            hll.set_registers(bytearray(x))
            hll_res.merge(hll)
        return hll_res.registers()

    return _hll_merge
def hll_merge(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_merge(v: pd.Series) -> bytes:
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            hll.set_registers(bytearray(x))
            hll_res.merge(hll)
        return hll_res.registers()

    return _hll_merge
def hll_init(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.SCALAR)
    def _hll_init(v):
        hll = HyperLogLog(k)
        zero = hll.registers()

        def regs(x):
            hll.set_registers(zero)
            if x is not None:
                hll.add(str(x))
            return hll.registers()

        return v.apply(lambda x: regs(x))

    return _hll_init
def hll_init_agg(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.GROUPED_AGG)
    def _hll_init_agg(v):
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            if isinstance(x, (bytes, bytearray)):
                hll.set_registers(bytearray(x))
                hll_res.merge(hll)
            elif x is not None:
                hll_res.add(str(x))
        return hll_res.registers()

    return _hll_init_agg
def hll_init_agg(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_init_agg(v: pd.Series) -> bytes:
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            if isinstance(x, (bytes, bytearray)):
                hll.set_registers(bytearray(x))
                hll_res.merge(hll)
            elif x is not None:
                hll_res.add(str(x))
        return hll_res.registers()

    return _hll_init_agg
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale of the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
def hll_init(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_init(v: pd.Series) -> pd.Series:
        hll = HyperLogLog(k)
        zero = hll.registers()

        def regs(x):
            hll.set_registers(zero)
            if x is not None:
                hll.add(str(x))
            return hll.registers()

        return v.apply(lambda x: regs(x))

    return _hll_init
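# A minimal usage sketch for the HLL UDFs above, assuming a DataFrame `df`
# with a grouping column "g" and a value column "x" (both names are
# placeholders) and the imports used in the snippets:
init_agg = hll_init_agg(k=12)
merge = hll_merge(k=12)

per_row = df.withColumn("hll", hll_init()(F.col("x")))                # one sketch per row
per_group = df.groupBy("g").agg(init_agg(F.col("x")).alias("hll"))    # one sketch per group
merged = per_group.agg(merge(F.col("hll")).alias("hll"))              # one global sketch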
def handle_grouped_data(data_list, window) -> T.ArrayType(T.BinaryType()):
    if not window or len(window) != 1:
        print("handle_grouped_data should have exactly one window instance")
        return None
    if len(data_list) == 0:
        return None
    sample = data_list[0]
    sample_dict = json.loads(sample)
    print(
        f"data_list length: {len(data_list)}, school: {sample_dict['school']}, "
        f"major: {sample_dict['major']}, window start: {window[0].start}, "
        f"window end: {window[0].end}"
    )
    # Do whatever you want with the data set here.
    # Return the result list, which will be produced to the Kafka sink topic.
    results = [sample]
    return results
def basic_msg_schema():
    schema = types.StructType([
        types.StructField('double_field', types.DoubleType()),
        types.StructField('float_field', types.FloatType()),
        types.StructField('int32_field', types.IntegerType()),
        types.StructField('int64_field', types.LongType()),
        types.StructField('uint32_field', types.IntegerType()),
        types.StructField('uint64_field', types.LongType()),
        types.StructField('sint32_field', types.IntegerType()),
        types.StructField('sint64_field', types.LongType()),
        types.StructField('fixed32_field', types.IntegerType()),
        types.StructField('fixed64_field', types.LongType()),
        types.StructField('sfixed32_field', types.IntegerType()),
        types.StructField('sfixed64_field', types.LongType()),
        types.StructField('bool_field', types.BooleanType()),
        types.StructField('string_field', types.StringType()),
        types.StructField('bytes_field', types.BinaryType()),
        types.StructField('enum_field', types.IntegerType()),
    ])
    return schema
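# A minimal usage sketch: materialize an empty DataFrame with this schema
# (`spark` is assumed to be an active SparkSession):
df = spark.createDataFrame([], schema=basic_msg_schema())
df.printSchema()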
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal,):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
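# A few illustrative mappings for the function above (assuming the same
# numpy/datetime/decimal/pyspark imports as the snippet):
as_spark_type(int)                # -> types.IntegerType()
as_spark_type("bigint")           # -> types.LongType()
as_spark_type(datetime.datetime)  # -> types.TimestampType()
as_spark_type(dict)               # raises TypeError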
# "attachment_names": [ # attached.longFilename for attached in msg_obj.attachments # ], "subject": msg_obj.subject, "body": msg_obj.body, } if with_attachments: msg_properties_dict["attachments"] = { attached.longFilename: attached.data for attached in msg_obj.attachments if attachment_type is None or attached.longFilename.endswith(attachment_type) } return [v for v in msg_properties_dict.values()] @udf(types.BinaryType()) def get_email_attachment(email_file, name_filter, index): """ Returns the Nth attachment that matches the specified name_filter argumnet, where N=index. Returns None if not found. """ raise NotImplementedError() return None @udf(types.BinaryType()) def extract_file_from_zip(zipfile, filename): """ Returns the compressed file `filename` from `zipfile`. """ raise NotImplementedError()
def spark_dtype(self):
    # Lazily load pyspark to avoid creating a pyspark dependency on the data
    # reading code path (currently works only with make_batch_reader). We
    # should move all pyspark-related code into a separate module.
    import pyspark.sql.types as sql_types

    return sql_types.BinaryType()
from google.protobuf.json_format import _IsMapEntry
from pyspark.sql import types
from pyspark.sql.types import StructType, StructField, DataType

# scalar value types mapping between ProtoBuf and Spark SQL
_SPARK_SQL_TYPE_MAP = {
    FieldDescriptor.TYPE_DOUBLE: types.DoubleType(),
    FieldDescriptor.TYPE_FLOAT: types.FloatType(),
    FieldDescriptor.TYPE_INT64: types.LongType(),
    FieldDescriptor.TYPE_UINT64: types.LongType(),
    FieldDescriptor.TYPE_INT32: types.IntegerType(),
    FieldDescriptor.TYPE_FIXED64: types.LongType(),
    FieldDescriptor.TYPE_FIXED32: types.IntegerType(),
    FieldDescriptor.TYPE_BOOL: types.BooleanType(),
    FieldDescriptor.TYPE_STRING: types.StringType(),
    FieldDescriptor.TYPE_BYTES: types.BinaryType(),
    FieldDescriptor.TYPE_UINT32: types.IntegerType(),
    FieldDescriptor.TYPE_ENUM: types.IntegerType(),
    FieldDescriptor.TYPE_SFIXED32: types.IntegerType(),
    FieldDescriptor.TYPE_SFIXED64: types.LongType(),
    FieldDescriptor.TYPE_SINT32: types.IntegerType(),
    FieldDescriptor.TYPE_SINT64: types.LongType(),
}


def proto3_message_to_spark_dataframe():
    pass


def proto3_message_type_to_spark_schema(
    message_type: GeneratedProtocolMessageType,
) -> StructType:
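# A hedged sketch of how the map above could be applied to a message's scalar
# fields (the helper name and nullability choice are assumptions; message,
# map, and repeated fields are deliberately ignored here):
def _scalar_field_to_struct_field(field: FieldDescriptor) -> StructField:
    return StructField(field.name, _SPARK_SQL_TYPE_MAP[field.type], True)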
    # assume a.shape = (n,)
    raw = blosc.compress_ptr(a.__array_interface__['data'][0], a.size,
                             a.dtype.itemsize)
    return {'size': a.size, 'dtype': str(a.dtype), 'data': raw}


def unpack(col):
    a = np.empty(col['size'], dtype=col['dtype'])
    blosc.decompress_ptr(bytes(col['data']), a.__array_interface__['data'][0])
    return a


columntype = types.StructType([
    types.StructField("size", types.IntegerType(), False),
    types.StructField("dtype", types.StringType(), False),
    types.StructField("data", types.BinaryType(), False),
])

columnrecordtype = types.StructType([
    types.StructField("dataset", types.StringType(), False),
    types.StructField("chunkindex", types.IntegerType(), False),
    types.StructField(
        "columns",
        types.StructType([
            types.StructField("derived", columntype, False),
        ]), False),
])


@udf(columnrecordtype)
def process_chunk(chunkid, columns):
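# A quick round-trip check for the pack/unpack pair above, assuming the
# compress fragment is the body of a `pack(a)` function (that name is an
# assumption; only its return statement appears in the snippet):
a = np.arange(1000, dtype=np.float64)
col = pack(a)
assert np.array_equal(unpack(col), a)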
spark_check_point_dir = "checkpoint"

WATERMARK_DELAY_THRESHOLD_SECONDS = 5  # watermark delay
WINDOW_DURATION_SECONDS = 60  # window size
WINDOW_SLIDE_DURATION_SECONDS = 5  # window slide step

WATERMARK_DELAY_THRESHOLD = "{} seconds".format(
    WATERMARK_DELAY_THRESHOLD_SECONDS)
WINDOW_DURATION = "{} seconds".format(WINDOW_DURATION_SECONDS)
WINDOW_SLIDE_DURATION = "{} seconds".format(WINDOW_SLIDE_DURATION_SECONDS)

transferSchema = T.StructType([
    T.StructField("timestamp", T.TimestampType(), True),
    T.StructField("school", T.StringType(), True),
    T.StructField("major", T.StringType(), True),
    T.StructField("data", T.BinaryType(), True),
])


def _kafka_args():
    parser = argparse.ArgumentParser(add_help=False)
    args = parser.add_argument_group("kafka configurations")
    args.add_argument(
        "-t",
        "--source_topics",
        help="Kafka source topics, separated by commas",
        default="kafka-default",
    )
    args.add_argument("-a", "--address", help="Kafka address.",
def start_algorithm_logic(sdf: DataFrame) -> DataFrame:
    @F.udf(returnType=transferSchema)
    def transferDF(data):
        # Deserialize the incoming payload; it could be protobuf or JSON.
        # Adjust to whatever structure the Kafka producer emits, e.g.:
        #   obj = ObjectInfo()
        #   obj.ParseFromString(pb_bytes)
        # Here we treat data as a JSON dict:
        # {"timestamp": "2006-01-02Z03:04:05", "school": "cambridge",
        #  "major": "computer science", "name": "hanmeimei", "extra": ""}
        obj = json.loads(data)
        return (
            datetime.strptime(obj["timestamp"], "%Y-%m-%dT%H:%M:%S.%f"),
            obj["school"],
            obj["major"],
            data,
        )

    @F.udf(T.ArrayType(T.BinaryType()))
    def handle_grouped_data(data_list, window) -> T.ArrayType(T.BinaryType()):
        if not window or len(window) != 1:
            print("handle_grouped_data should have exactly one window instance")
            return None
        if len(data_list) == 0:
            return None
        sample = data_list[0]
        sample_dict = json.loads(sample)
        print(
            f"data_list length: {len(data_list)}, school: {sample_dict['school']}, "
            f"major: {sample_dict['major']}, window start: {window[0].start}, "
            f"window end: {window[0].end}"
        )
        # Do whatever you want with the data set here.
        # Return the result list, which will be produced to the Kafka sink topic.
        results = [sample]
        return results

    # Extract timestamp, school, and major from the payload for grouping and
    # windowing.
    sdf = sdf.select(transferDF(sdf.value).alias("window_data")).select(
        "window_data.timestamp",
        "window_data.school",
        "window_data.major",
        "window_data.data",
    )

    # Set the watermark and group the sliding-window data by school and major.
    window_group = sdf.withWatermark("timestamp", WATERMARK_DELAY_THRESHOLD).groupBy(
        F.window(F.col("timestamp"), WINDOW_DURATION, WINDOW_SLIDE_DURATION),
        F.col("school"),
        F.col("major"),
    )

    # Process the windowed data, then flatten the per-window result sets.
    result_df_set = window_group.agg(
        handle_grouped_data(F.collect_list("data"), F.collect_set("window")).alias(
            "value_set")).withColumn(
                "value", F.explode(F.col("value_set")))

    # Return the result.
    return result_df_set.filter(result_df_set.value != b"")
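# A hedged wiring sketch for the function above (`spark` is an active
# SparkSession; the bootstrap server and sink topic names are placeholders;
# spark_check_point_dir comes from the config snippet earlier):
source = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "kafka-default")
    .load()
)
query = (
    start_algorithm_logic(source)
    .selectExpr("CAST(value AS STRING) AS value")  # keep only the Kafka value column
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "sink-topic")
    .option("checkpointLocation", spark_check_point_dir)
    .start()
)
query.awaitTermination()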
from tinsel.types import NoneType, long, short, byte, decimal

DEFAULT_NAME = "some_field"


@struct
class Dummy(NamedTuple):
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]
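# A minimal check tying these tables back to infer_spark_type from the
# earlier snippet (assuming both are importable in the same test module):
for py_type, spark_type in PRIMITIVES + SYNTHETIC_PRIMITIVES + DATE_TYPES:
    assert infer_spark_type(py_type) == spark_type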
[ types.StructField("double_field", types.DoubleType()), types.StructField("float_field", types.FloatType()), types.StructField("int32_field", types.IntegerType()), types.StructField("int64_field", types.LongType()), types.StructField("uint32_field", types.IntegerType()), types.StructField("uint64_field", types.LongType()), types.StructField("sint32_field", types.IntegerType()), types.StructField("sint64_field", types.LongType()), types.StructField("fixed32_field", types.IntegerType()), types.StructField("fixed64_field", types.LongType()), types.StructField("sfixed32_field", types.IntegerType()), types.StructField("sfixed64_field", types.LongType()), types.StructField("bool_field", types.BooleanType()), types.StructField("string_field", types.StringType()), types.StructField("bytes_field", types.BinaryType()), types.StructField("enum_field", types.IntegerType()), types.StructField("repeated_field", types.ArrayType(types.IntegerType())), types.StructField("nested_field", simple_msg_schema), types.StructField("repeated_nested_field", types.ArrayType(simple_msg_schema)), types.StructField( "repeated_keyvalue_field", types.ArrayType(keyvalue_msg_schema) ), types.StructField( "simple_map_field", types.MapType(types.StringType(), types.IntegerType()) ), types.StructField( "complex_map_field", types.MapType(types.StringType(), map_msg_schema) ), types.StructField("repeated_map_field", types.ArrayType(map_msg_schema)), types.StructField(
def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version
    # should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1):  # type: ignore[union-attr]
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],  # type: ignore[union-attr]
                    raise_error=raise_error))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__, list):  # type: ignore[union-attr]
        element_type = as_spark_type(
            tpe.__args__[0], raise_error=raise_error)  # type: ignore[union-attr]
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale of the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType, or TimestampNTZType if the timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
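# Illustrative mappings for the categorical/extension branches above
# (these require a pandas build where the nullable extension dtypes are
# available):
as_spark_type("Int64")             # -> types.LongType()    (extension integral)
as_spark_type("boolean")           # -> types.BooleanType() (extension boolean)
as_spark_type(CategoricalDtype())  # -> types.LongType()    (categorical)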
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
    types.ArrayType(types.StringType()): []
}


def _build_type_dict():
    return dict([(other_type, spark_type)
                 for (spark_type, l) in _base.items()
        return get_soxi_info_udf(audio_file_series)

            durations.append(duration)
        return pd.Series(durations)

    return get_soxi_info_udf


get_audio_seconds_udf = _prepare_soxi_udf("-D", DoubleType(), float)
get_audio_sample_rate_udf = _prepare_soxi_udf("-r", StringType(), str)
get_audio_annotations_udf = _prepare_soxi_udf("-a", BinaryType(), bytes)

# Can I return an array type of struct types?
AUDIO_SEGMENTS_RETURN_TYPE = T.StructType(
    [
        T.StructField("audio_name", T.ArrayType(T.StringType())),
        T.StructField("audio", T.ArrayType(T.BinaryType())),
    ]
)


@F.pandas_udf(AUDIO_SEGMENTS_RETURN_TYPE)
def create_audio_segments_udf(
    audio_bytes_series: pd.Series,
    audio_type_series: pd.Series,
    audio_name_series: pd.Series,
    start_ms_array_series: pd.Series,
    end_ms_array_series: pd.Series,
    output_audio_codec_series: pd.Series,
) -> pd.DataFrame:
    output_array = []
    assert (
def as_spark_type(tpe: typing.Union[str, type, Dtype], *,
                  raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale of the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
def sqlType(cls):
    # NB: this is actually an instance method in practice O_O !
    return types.StructType([
        types.StructField("np_bytes", types.BinaryType(), False)
    ])
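# A hedged sketch of the UserDefinedType this method likely belongs to,
# assuming the ndarray's dtype is fixed out of band (the class name, the
# serialize/deserialize bodies, and the float64 dtype are all assumptions):
class NumpyBytesUDT(types.UserDefinedType):
    @classmethod
    def sqlType(cls):
        return types.StructType([
            types.StructField("np_bytes", types.BinaryType(), False)
        ])

    def serialize(self, obj):
        # store the raw ndarray buffer in the single binary field
        return (obj.tobytes(),)

    def deserialize(self, datum):
        return np.frombuffer(datum[0], dtype=np.float64)  # assumed dtype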