Example 1
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
Example 2
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example 3
def saveParquetDataInProjectFolder(dataframe, project_id, qualifier=1):
    import os
    # Check whether the dataframe contains an image struct column.

    images = list(
        filter(
            lambda x: x[1] ==
            'struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>',
            dataframe.dtypes))
    if len(images) > 0:
        for image in images:
            from pyspark.sql import types as T
            from pyspark.sql import functions as F
            from pyspark.ml.linalg import DenseVector
            folder = getProjectDataDir(project_id) + "/" + str(
                qualifier) + "." + image[0]
            os.makedirs(folder, exist_ok=True)

            def setpath(copyto):
                def copyImageAndTrimName(input):
                    originSplitted = input['origin'].split("/")

                    print(copyto)
                    newOrigin = originSplitted[len(originSplitted) - 1]
                    from shutil import copyfile
                    # Skip the first 8 characters, i.e. the "file:///" prefix
                    copyfile(input['origin'][8:], copyto + "/" + newOrigin)
                    return {
                        "origin": newOrigin,
                        "height": input['height'],
                        "width": input['width'],
                        "nChannels": input['nChannels'],
                        "mode": input['mode'],
                        "data": input['data']
                    }

                return copyImageAndTrimName

            myFunction = setpath(folder)
            copyImageAndTrimNameUDF = F.udf(
                myFunction,
                T.StructType([
                    T.StructField("origin", T.StringType(), True),
                    T.StructField("height", T.IntegerType(), True),
                    T.StructField("width", T.IntegerType(), True),
                    T.StructField("nCHannels", T.IntegerType(), True),
                    T.StructField("mode", T.IntegerType(), True),
                    T.StructField("data", T.BinaryType(), True)
                ]))

            dataframe = dataframe.withColumn(image[0],
                                             copyImageAndTrimNameUDF(image[0]))

    dataframe.write.mode("overwrite").parquet(
        getProjectDataDir(project_id) + "/" + str(qualifier) + ".parquet")
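A hedged call sketch for the function above; the image DataFrame, project id and qualifier are assumptions, and getProjectDataDir is assumed to come from the surrounding module:

saveParquetDataInProjectFolder(image_df, project_id="demo-project", qualifier=2)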
Example 4
def hll_merge(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.GROUPED_AGG)
    def _hll_merge(v):
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            hll.set_registers(bytearray(x))
            hll_res.merge(hll)
        return hll_res.registers()
    return _hll_merge
Example 5
def hll_merge(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_merge(v: pd.DataFrame) -> bytes:
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            hll.set_registers(bytearray(x))
            hll_res.merge(hll)
        return hll_res.registers()

    return _hll_merge
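A hedged usage sketch that combines hll_init (shown further below) with a grouped-aggregate hll_merge like the ones above to estimate distinct values per key; the SparkSession spark and the toy data are assumptions:

df = spark.createDataFrame([(1, "a"), (1, "b"), (2, "a")], ["key", "value"])
sketched = df.withColumn("hll", hll_init()(F.col("value")))   # one HLL register set per row
merged = sketched.groupBy("key").agg(hll_merge()(F.col("hll")).alias("hll"))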
Example 6
def hll_init(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.SCALAR)
    def _hll_init(v):
        hll = HyperLogLog(k)
        zero = hll.registers()
        def regs(x):
            hll.set_registers(zero)
            if x is not None:
                hll.add(str(x))
            return hll.registers()
        return v.apply(lambda x: regs(x))
    return _hll_init
Example 7
def hll_init_agg(k=12):
    @F.pandas_udf(T.BinaryType(), F.PandasUDFType.GROUPED_AGG)
    def _hll_init_agg(v):
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            if isinstance(x, (bytes, bytearray)):
                hll.set_registers(bytearray(x))
                hll_res.merge(hll)
            elif x is not None:
                hll_res.add(str(x))
        return hll_res.registers()
    return _hll_init_agg
Example 8
def hll_init_agg(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_init_agg(v: pd.DataFrame) -> bytes:
        hll_res = HyperLogLog(k)
        hll = HyperLogLog(k)
        for x in v:
            if isinstance(x, (bytes, bytearray)):
                hll.set_registers(bytearray(x))
                hll_res.merge(hll)
            elif x is not None:
                hll_res.add(str(x))
        return hll_res.registers()

    return _hll_init_agg
Example 9
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example 10
def hll_init(k=12):
    @F.pandas_udf(T.BinaryType())
    def _hll_init(v: pd.Series) -> pd.Series:
        hll = HyperLogLog(k)
        zero = hll.registers()

        def regs(x):
            hll.set_registers(zero)
            if x is not None:
                hll.add(str(x))
            return hll.registers()

        return v.apply(lambda x: regs(x))

    return _hll_init
Example 11
    def handle_grouped_data(data_list, window) -> T.ArrayType(T.BinaryType()):
        if not window or len(window) != 1:
            print("handle_grouped_data, should have only one window instance")
            return None
        if len(data_list) == 0:
            return None
        sample = data_list[0]
        sample_dict = json.loads(sample)
        print(
            f"date_list length:{len(data_list)},school: {sample_dict['school']}, major: {sample_dict['major']}, window start:{window[0].start}, window end:{window[0].end}"
        )

        # do whatever you want to do with the data set here.

        # return the result list which will produce to kafka sink topic
        results = [sample]
        return results
Example 12
def basic_msg_schema():
    schema = types.StructType([
        types.StructField('double_field', types.DoubleType()),
        types.StructField('float_field', types.FloatType()),
        types.StructField('int32_field', types.IntegerType()),
        types.StructField('int64_field', types.LongType()),
        types.StructField('uint32_field', types.IntegerType()),
        types.StructField('uint64_field', types.LongType()),
        types.StructField('sint32_field', types.IntegerType()),
        types.StructField('sint64_field', types.LongType()),
        types.StructField('fixed32_field', types.IntegerType()),
        types.StructField('fixed64_field', types.LongType()),
        types.StructField('sfixed32_field', types.IntegerType()),
        types.StructField('sfixed64_field', types.LongType()),
        types.StructField('bool_field', types.BooleanType()),
        types.StructField('string_field', types.StringType()),
        types.StructField('bytes_field', types.BinaryType()),
        types.StructField('enum_field', types.IntegerType()),
    ])
    return schema
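As a quick sanity check, the schema can be passed straight to createDataFrame; the SparkSession spark is an assumption:

df = spark.createDataFrame([], schema=basic_msg_schema())
df.printSchema()  # bytes_field is reported as binary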
Example 13
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal, ):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example 14
        # "attachment_names": [
        #     attached.longFilename for attached in msg_obj.attachments
        # ],
        "subject": msg_obj.subject,
        "body": msg_obj.body,
    }
    if with_attachments:
        msg_properties_dict["attachments"] = {
            attached.longFilename: attached.data
            for attached in msg_obj.attachments if attachment_type is None
            or attached.longFilename.endswith(attachment_type)
        }
    return [v for v in msg_properties_dict.values()]


@udf(types.BinaryType())
def get_email_attachment(email_file, name_filter, index):
    """
    Returns the Nth attachment that matches the specified name_filter argument, where N = index.
    Returns None if not found.
    """
    raise NotImplementedError()
    return None


@udf(types.BinaryType())
def extract_file_from_zip(zipfile, filename):
    """
    Returns the compressed file `filename` from `zipfile`.
    """
    raise NotImplementedError()
Example 15
    def spark_dtype(self):
        # Lazy-load pyspark to avoid creating a pyspark dependency on the data-reading code path
        # (currently works only with make_batch_reader). We should move all pyspark-related code into a separate module.
        import pyspark.sql.types as sql_types

        return sql_types.BinaryType()
Example 16
from google.protobuf.json_format import _IsMapEntry
from pyspark.sql import types
from pyspark.sql.types import StructType, StructField, DataType

# scalar value types mapping between ProtoBuf and Spark SQL
_SPARK_SQL_TYPE_MAP = {
    FieldDescriptor.TYPE_DOUBLE: types.DoubleType(),
    FieldDescriptor.TYPE_FLOAT: types.FloatType(),
    FieldDescriptor.TYPE_INT64: types.LongType(),
    FieldDescriptor.TYPE_UINT64: types.LongType(),
    FieldDescriptor.TYPE_INT32: types.IntegerType(),
    FieldDescriptor.TYPE_FIXED64: types.LongType(),
    FieldDescriptor.TYPE_FIXED32: types.IntegerType(),
    FieldDescriptor.TYPE_BOOL: types.BooleanType(),
    FieldDescriptor.TYPE_STRING: types.StringType(),
    FieldDescriptor.TYPE_BYTES: types.BinaryType(),
    FieldDescriptor.TYPE_UINT32: types.IntegerType(),
    FieldDescriptor.TYPE_ENUM: types.IntegerType(),
    FieldDescriptor.TYPE_SFIXED32: types.IntegerType(),
    FieldDescriptor.TYPE_SFIXED64: types.LongType(),
    FieldDescriptor.TYPE_SINT32: types.IntegerType(),
    FieldDescriptor.TYPE_SINT64: types.LongType(),
}
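A hedged lookup sketch for the map above; MyMessage and the field name are hypothetical, not taken from the original snippet:

field = MyMessage.DESCRIPTOR.fields_by_name["payload"]
spark_type = _SPARK_SQL_TYPE_MAP[field.type]  # e.g. TYPE_BYTES maps to BinaryType()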


def proto3_message_to_spark_dataframe():
    pass


def proto3_message_type_to_spark_schema(
    message_type: GeneratedProtocolMessageType, ) -> StructType:
Example 17
    # assume a.shape = (n,)
    raw = blosc.compress_ptr(a.__array_interface__['data'][0], a.size,
                             a.dtype.itemsize)
    return {'size': a.size, 'dtype': str(a.dtype), 'data': raw}


def unpack(col):
    a = np.empty(col['size'], dtype=col['dtype'])
    blosc.decompress_ptr(bytes(col['data']), a.__array_interface__['data'][0])
    return a


columntype = types.StructType([
    types.StructField("size", types.IntegerType(), False),
    types.StructField("dtype", types.StringType(), False),
    types.StructField("data", types.BinaryType(), False),
])

columnrecordtype = types.StructType([
    types.StructField("dataset", types.StringType(), False),
    types.StructField("chunkindex", types.IntegerType(), False),
    types.StructField(
        "columns",
        types.StructType([
            types.StructField("derived", columntype, False),
        ]), False),
])


@udf(columnrecordtype)
def process_chunk(chunkid, columns):
Example 18
spark_check_point_dir = "checkpoint"

WATERMARK_DELAY_THRESHOLD_SECONDS = 5  # watermark delay
WINDOW_DURATION_SECONDS = 60  # window duration
WINDOW_SLIDE_DURATION_SECONDS = 5  # window slide interval
WATERMARK_DELAY_THRESHOLD = "{} seconds".format(
    WATERMARK_DELAY_THRESHOLD_SECONDS)
WINDOW_DURATION = "{} seconds".format(WINDOW_DURATION_SECONDS)
WINDOW_SLIDE_DURATION = "{} seconds".format(WINDOW_SLIDE_DURATION_SECONDS)

transferSchema = T.StructType([
    T.StructField("timestamp", T.TimestampType(), True),
    T.StructField("school", T.StringType(), True),
    T.StructField("major", T.StringType(), True),
    T.StructField("data", T.BinaryType(), True),
])


def _kafka_args():
    parser = argparse.ArgumentParser(add_help=False)
    args = parser.add_argument_group("kafka configurations")
    args.add_argument(
        "-t",
        "--source_topics",
        help="Kafka source topics, separate by comma",
        default="kafka-default",
    )
    args.add_argument("-a",
                      "--address",
                      help="Kafka address.",
Example 19
def start_algorithm_logic(sdf: DataFrame) -> DataFrame:
    @F.udf(returnType=transferSchema)
    def transferDF(data):
        # Deserialize the incoming record; it may be protobuf or JSON. Adjust this
        # to match the structure produced by the Kafka producer, e.g. for protobuf:
        # obj = ObjectInfo()
        # obj.ParseFromString(pb_bytes)
        # Here we treat `data` as a JSON dict, e.g.:
        # {"timestamp": "2006-01-02Z03:04:05", "school": "cambridge", "major": "computer science", "name": "hanmeimei", "extra": ""}
        obj = json.loads(data)
        return (
            datetime.strptime(obj["timestamp"], "%Y-%m-%dT%H:%M:%S.%f"),
            obj["school"],
            obj["major"],
            data,
        )

    @F.udf(T.ArrayType(T.BinaryType()))
    def handle_grouped_data(data_list, window) -> T.ArrayType(T.BinaryType()):
        if not window or len(window) != 1:
            print("handle_grouped_data, should have only one window instance")
            return None
        if len(data_list) == 0:
            return None
        sample = data_list[0]
        sample_dict = json.loads(sample)
        print(
            f"date_list length:{len(data_list)},school: {sample_dict['school']}, major: {sample_dict['major']}, window start:{window[0].start}, window end:{window[0].end}"
        )

        # do whatever you want to do with the data set here.

        # return the result list which will produce to kafka sink topic
        results = [sample]
        return results

    # Split timestamp, school and major out of the payload for grouping and windowing
    sdf = sdf.select(transferDF(sdf.value).alias("window_data")).select(
        "window_data.timestamp",
        "window_data.school",
        "window_data.major",
        "window_data.data",
    )

    # Set the watermark, then group the sliding-window data by school and major
    window_group = sdf.withWatermark("timestamp",
                                     WATERMARK_DELAY_THRESHOLD).groupBy(
                                         F.window(F.col("timestamp"),
                                                  WINDOW_DURATION,
                                                  WINDOW_SLIDE_DURATION),
                                         F.col("school"),
                                         F.col("major"),
                                     )
    # Process the grouped data: aggregate each window's records, then explode the
    # returned list back into individual rows
    result_df_set = window_group.agg(
        handle_grouped_data(F.collect_list("data"),
                            F.collect_set("window")).alias("value_set")
    ).withColumn("value", F.explode(F.col("value_set")))

    # Return the result
    return result_df_set.filter(result_df_set.value != b"")
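A hedged sketch of wiring the returned stream back to a Kafka sink; the broker address, topic name and the streaming DataFrame sdf are assumptions:

query = (
    start_algorithm_logic(sdf)
    .selectExpr("CAST(value AS STRING) AS value")
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "sink-topic")
    .option("checkpointLocation", spark_check_point_dir)
    .start()
)
query.awaitTermination()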
Example 20
from tinsel.types import NoneType, long, short, byte, decimal

DEFAULT_NAME = "some_field"


@struct
class Dummy(NamedTuple):
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]
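These fixtures usually feed a parametrized test; a minimal sketch assuming pytest and an infer_spark_type like the one in Example 2:

import pytest

@pytest.mark.parametrize("py_type, spark_type",
                         PRIMITIVES + SYNTHETIC_PRIMITIVES + DATE_TYPES)
def test_primitive_types(py_type, spark_type):
    assert infer_spark_type(py_type) == spark_type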
Example 21
 [
     types.StructField("double_field", types.DoubleType()),
     types.StructField("float_field", types.FloatType()),
     types.StructField("int32_field", types.IntegerType()),
     types.StructField("int64_field", types.LongType()),
     types.StructField("uint32_field", types.IntegerType()),
     types.StructField("uint64_field", types.LongType()),
     types.StructField("sint32_field", types.IntegerType()),
     types.StructField("sint64_field", types.LongType()),
     types.StructField("fixed32_field", types.IntegerType()),
     types.StructField("fixed64_field", types.LongType()),
     types.StructField("sfixed32_field", types.IntegerType()),
     types.StructField("sfixed64_field", types.LongType()),
     types.StructField("bool_field", types.BooleanType()),
     types.StructField("string_field", types.StringType()),
     types.StructField("bytes_field", types.BinaryType()),
     types.StructField("enum_field", types.IntegerType()),
     types.StructField("repeated_field", types.ArrayType(types.IntegerType())),
     types.StructField("nested_field", simple_msg_schema),
     types.StructField("repeated_nested_field", types.ArrayType(simple_msg_schema)),
     types.StructField(
         "repeated_keyvalue_field", types.ArrayType(keyvalue_msg_schema)
     ),
     types.StructField(
         "simple_map_field", types.MapType(types.StringType(), types.IntegerType())
     ),
     types.StructField(
         "complex_map_field", types.MapType(types.StringType(), map_msg_schema)
     ),
     types.StructField("repeated_map_field", types.ArrayType(map_msg_schema)),
     types.StructField(
Example 22
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],
                    raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__,
            list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0],
            raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType(
        ) if prefer_timestamp_ntz else types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
Example 23
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
    types.ArrayType(types.StringType()): []
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
Example 24
                    return get_soxi_info_udf(audio_file_series)
                durations.append(duration)
            return pd.Series(durations)

    return get_soxi_info_udf


get_audio_seconds_udf = _prepare_soxi_udf("-D", DoubleType(), float)
get_audio_sample_rate_udf = _prepare_soxi_udf("-r", StringType(), str)
get_audio_annotations_udf = _prepare_soxi_udf("-a", BinaryType(), bytes)

# Can I return an array type of struct types?
AUDIO_SEGMENTS_RETURN_TYPE = T.StructType(
    [
        T.StructField("audio_name", T.ArrayType(T.StringType())),
        T.StructField("audio", T.ArrayType(T.BinaryType())),
    ]
)


@F.pandas_udf(AUDIO_SEGMENTS_RETURN_TYPE)
def create_audio_segments_udf(
    audio_bytes_series: pd.Series,
    audio_type_series: pd.Series,
    audio_name_series: pd.Series,
    start_ms_array_series: pd.Series,
    end_ms_array_series: pd.Series,
    output_audio_codec_series: pd.Series,
) -> pd.DataFrame:
    output_array = []
    assert (
Example 25
def as_spark_type(tpe: typing.Union[str, type, Dtype],
                  *,
                  raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__,
                                                   list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0],
                                     raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
Example 26
def sqlType(cls):
  # NB: this is actually an instance method in practice O_O !
  return types.StructType([
    types.StructField("np_bytes", types.BinaryType(), False)
  ])