def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
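A quick usage sketch for the dispatcher above, assuming the tinsel package and its helper types (as imported in the test module at the end of this section) are available:

# Hedged sketch: exercises infer_spark_type with a few primitive type classes.
from decimal import Decimal
from pyspark.sql import types as t
from tinsel.lib import infer_spark_type
from tinsel.types import long

assert infer_spark_type(str) == t.StringType()
assert infer_spark_type(int) == t.IntegerType()
assert infer_spark_type(long) == t.LongType()
assert infer_spark_type(type(None)) == t.NullType()
assert infer_spark_type(Decimal) == t.DecimalType(precision=36, scale=6)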
def infer_pd_series_spark_type(
    pser: pd.Series, dtype: Dtype, prefer_timestamp_ntz: bool = False
) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :param prefer_timestamp_ntz: if true, infers datetime without timezone as
        TimestampNTZType type. If false, infers it as TimestampType.
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        elif hasattr(pser.iloc[0], "__UDT__"):
            return pser.iloc[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
    elif isinstance(dtype, CategoricalDtype):
        if isinstance(pser.dtype, CategoricalDtype):
            return as_spark_type(pser.cat.codes.dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
        else:
            # `pser` must already be converted to codes.
            return as_spark_type(pser.dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
    else:
        return as_spark_type(dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
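Recent PySpark releases ship this helper in the pandas-on-Spark type machinery; a hedged usage sketch follows (the exact import path and signature vary across Spark versions):

import pandas as pd
from pyspark.pandas.typedef import infer_pd_series_spark_type  # path is an assumption

all_null = pd.Series([None, None], dtype="object")
print(infer_pd_series_spark_type(all_null, all_null.dtype))  # NullType: every value is null

floats = pd.Series([1.5, 2.5])
print(infer_pd_series_spark_type(floats, floats.dtype))      # DoubleType via as_spark_type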
def get_spark_data_type(input_value):
    return {
        "str": T.StringType(),
        "int": T.LongType(),
        "bool": T.BooleanType(),
        "float": T.DoubleType(),
        "NoneType": T.NullType(),
    }[type(input_value).__name__]
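A minimal sketch exercising the lookup above; unmapped value types fall through to a KeyError:

from pyspark.sql import types as T

assert get_spark_data_type("hello") == T.StringType()
assert get_spark_data_type(42) == T.LongType()
assert get_spark_data_type(None) == T.NullType()
# get_spark_data_type(b"raw")  # would raise KeyError: 'bytes'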
def get_type(obj):
    if obj is None:
        return T.NullType()
    if type(obj) == type(type):
        return python_type_mappings.get(obj)()
    if type(obj) == str:
        return string_type_mapping.get(obj)()
    raise TypeError(f"type {type(obj)} cannot be mapped")
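The two mapping dicts are not shown in this snippet; the sketch below fills them in with one plausible, purely hypothetical shape so that get_type can be exercised:

from pyspark.sql import types as T

# Hypothetical mappings -- the real ones are not part of the snippet above.
python_type_mappings = {str: T.StringType, int: T.LongType, float: T.DoubleType, bool: T.BooleanType}
string_type_mapping = {"string": T.StringType, "long": T.LongType, "double": T.DoubleType}

assert get_type(None) == T.NullType()        # None maps straight to NullType
assert get_type(str) == T.StringType()       # a Python class goes through python_type_mappings
assert get_type("double") == T.DoubleType()  # a type name goes through string_type_mapping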
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()
    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id, update_id=update_id, text=text, sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warn('unable to POST to visualizer, error:')
            logging.warn(e.message)

    sentiment_generator = functions.udf(sentiment_generator_impl, types.NullType())

    records = (spark.readStream
               .format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('subscribe', args.topic)
               .load()
               .select(functions.column('value').cast(types.StringType()).alias('value'))
               .select(functions.from_json(functions.column('value'), msg_struct).alias('json'))
               .select(functions.column('json.user_id'),
                       functions.column('json.update_id'),
                       functions.column('json.text'),
                       sentiment_generator(functions.column('json.text'),
                                           functions.column('json.user_id'),
                                           functions.column('json.update_id')))
               .writeStream.format("console").start())

    records.awaitTermination()
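SpacyMagic is referenced but not defined in the snippet above; a common implementation (an assumption here, not the author's code) lazily loads and caches the spaCy pipeline once per executor process, since the model is too heavy to reload for every record:

import spacy

class SpacyMagic:
    """Hypothetical helper: load each spaCy pipeline at most once per process."""
    _pipelines = {}

    @classmethod
    def get(cls, lang):
        if lang not in cls._pipelines:
            cls._pipelines[lang] = spacy.load(lang)
        return cls._pipelines[lang]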
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            return types.NullType()
        elif hasattr(s.iloc[0], "__UDT__"):
            return s.iloc[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    else:
        return as_spark_type(dt)
def infer_pd_series_spark_type(pser: pd.Series, dtype: Dtype) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        elif hasattr(pser.iloc[0], "__UDT__"):
            return pser.iloc[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(pser).type)
    elif isinstance(dtype, CategoricalDtype):
        # `pser` must already be converted to codes.
        return as_spark_type(pser.dtype)
    else:
        return as_spark_type(dtype)
def infer_schema(rec):
    """infers dataframe schema for a record.
    Assumes every dict is a Struct, not a Map"""
    if isinstance(rec, dict):
        return pst.StructType([
            pst.StructField(key, DataWriter.infer_schema(value), True)
            for key, value in sorted(rec.items())
        ])
    elif isinstance(rec, list):
        if len(rec) == 0:
            # raise ValueError("can't infer type of an empty list")
            return pst.ArrayType(pst.NullType())
        elem_type = DataWriter.infer_schema(rec[0])
        for elem in rec:
            this_type = DataWriter.infer_schema(elem)
            if elem_type != this_type:
                raise ValueError("can't infer type of a list with inconsistent elem types")
        return pst.ArrayType(elem_type)
    else:
        return pst._infer_type(rec)
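A hedged usage sketch for the recursive inference above (it assumes infer_schema is a staticmethod on DataWriter and that pyspark.sql.types is imported as pst, as in the snippet):

schema = DataWriter.infer_schema({"name": "alice", "scores": [1.0, 2.5], "tags": []})
# -> StructType with "name" as StringType, "scores" as ArrayType(DoubleType),
#    and "tags" as ArrayType(NullType()) because an empty list carries no element type.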
import re

from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Define UDF
null_negative_int = F.udf(
    lambda val: T.NullType() if val is None or val < 0 else val,
    T.IntegerType())

null_negative_double = F.udf(
    lambda val: T.NullType() if val is None or val < 0 else val,
    T.DoubleType())

to_float_list = F.udf(
    lambda lst: [float(x_) if is_float(x_) else None for x_ in lst],
    T.ArrayType(T.DoubleType()),
)

is_float = lambda val: re.match(r"^-?\d+(?:\.\d+)?$", val) is not None


def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = r"[a-zA-Z][a-zA-Z\s\-]*"
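A small check of the regex-based is_float helper that to_float_list relies on; note that scientific notation is not matched by the pattern above:

assert is_float("3.14") and is_float("-2")
assert not is_float("abc") and not is_float("1e5")  # "1e5" fails the \d+(\.\d+)? pattern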
        ]), True),
        t.StructField(
            "h",
            t.ArrayType(
                t.MapType(
                    t.StringType(),
                    t.StructType([
                        t.StructField("d", t.DoubleType(), False),
                        t.StructField(
                            "e",
                            t.StructType([
                                t.StructField("a", t.StringType(), False),
                                t.StructField("b", t.IntegerType(), False),
                                t.StructField("c", t.BooleanType(), False)
                            ]), False),
                        t.StructField("f", t.NullType(), False)
                    ]), True),
                False),
            False),
        t.StructField(
            "i",
            t.MapType(
                t.StructType([
                    t.StructField("d", t.DoubleType(), False),
                    t.StructField(
                        "e",
                        t.StructType([
                            t.StructField("a", t.StringType(), False),
                            t.StructField("b", t.IntegerType(), False),
                            t.StructField("c", t.BooleanType(), False)
                        ]), False),
                    t.StructField("f", t.NullType(), False)
                ]),
import pytest
from typing import NamedTuple
from pyspark.sql import types as t
from tinsel.lib import infer_spark_type, transform_field, maybe_unlift_optional, struct
from tinsel.types import NoneType, long, short, byte, decimal

DEFAULT_NAME = "some_field"


@struct
class Dummy(NamedTuple):
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [