def schema(self):
    '''
    Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (IntegerType, LongType, ShortType,
                                   StringType, StructField, StructType)

    return StructType(
        [
            StructField('frame_time', StringType(), True),
            StructField('unix_tstamp', LongType(), True),
            StructField('frame_len', IntegerType(), True),
            StructField('ip_dst', StringType(), True),
            StructField('ip_src', StringType(), True),
            StructField('dns_qry_name', StringType(), True),
            StructField('dns_qry_class', StringType(), True),
            StructField('dns_qry_type', IntegerType(), True),
            StructField('dns_qry_rcode', IntegerType(), True),
            StructField('dns_a', StringType(), True),
            StructField('y', ShortType(), True),
            StructField('m', ShortType(), True),
            StructField('d', ShortType(), True),
            StructField('h', ShortType(), True)
        ]
    )

def test_apply_schema(self):
    from datetime import date, datetime
    rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                {"a": 1}, (2,), [1, 2, 3], None)])
    schema = StructType([
        StructField("byte1", ByteType(), False),
        StructField("byte2", ByteType(), False),
        StructField("short1", ShortType(), False),
        StructField("short2", ShortType(), False),
        StructField("int1", IntegerType(), False),
        StructField("float1", FloatType(), False),
        StructField("date1", DateType(), False),
        StructField("time1", TimestampType(), False),
        StructField("map1", MapType(StringType(), IntegerType(), False), False),
        StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
        StructField("list1", ArrayType(ByteType(), False), False),
        StructField("null1", DoubleType(), True)])
    df = self.spark.createDataFrame(rdd, schema)

    results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1,
                                    x.date1, x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1))
    r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1),
         datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)
    self.assertEqual(r, results.first())

    with self.tempView("table2"):
        df.createOrReplaceTempView("table2")
        r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
                           "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " +
                           "float1 + 1.5 as float1 FROM table2").first()
        self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r))

def load_df_with_schema(spark):
    schema = StructType([
        StructField("dateCrawled", TimestampType(), True),
        StructField("name", StringType(), True),
        StructField("seller", StringType(), False),
        StructField("offerType", StringType(), True),
        StructField("price", LongType(), True),
        StructField("abtest", StringType(), True),
        StructField("vehicleType", StringType(), True),
        StructField("yearOfRegistration", StringType(), True),
        StructField("gearbox", StringType(), True),
        StructField("powerPS", ShortType(), True),
        StructField("model", StringType(), True),
        StructField("kilometer", LongType(), True),
        StructField("monthOfRegistration", StringType(), True),
        StructField("fuelType", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("notRepairedDamage", StringType(), True),
        StructField("dateCreated", DateType(), True),
        StructField("nrOfPictures", ShortType(), True),
        StructField("postalCode", StringType(), True),
        StructField("lastSeen", TimestampType(), True)
    ])

    df = spark \
        .read \
        .format("csv") \
        .schema(schema) \
        .option("header", "true") \
        .load("/home/gavaskarrathnam/dataeng/etl-analytics-pyspark/data/autos.csv")

    print("Data loaded into PySpark", "\n")
    return df

def setUpClass(cls):
    cls._TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    cls._TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    cls._TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
    cls._TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    cls._TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    cls._TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)

def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))

def test_assert_schema_equal(spark):
    expected = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True)
    ])

    # Identical schema passes.
    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True)
    ])
    assert_schema_equal(expected, result)

    # Different data type fails.
    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', IntegerType(), True)
    ])
    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)

    # Different field name (case-sensitive) fails.
    result = StructType([
        StructField('Name', StringType(), True),
        StructField('age', ShortType(), True)
    ])
    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)

    # Different nullability fails.
    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), False)
    ])
    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)

def loadDFWithSchema(spark):
    schema = StructType([
        StructField('dateCrawled', TimestampType(), True),
        StructField('name', StringType(), True),
        StructField('seller', StringType(), True),
        StructField('offerType', StringType(), True),
        StructField('price', LongType(), True),
        StructField('abtest', StringType(), True),
        StructField('vehicleType', StringType(), True),
        StructField('yearOfRegistration', StringType(), True),
        StructField('gearbox', StringType(), True),
        StructField('powerPS', ShortType(), True),
        StructField('model', StringType(), True),
        StructField('kilometer', LongType(), True),
        StructField('monthOfRegistration', StringType(), True),
        StructField('fuelType', StringType(), True),
        StructField('brand', StringType(), True),
        StructField('notRepairedDamage', StringType(), True),
        StructField('dateCreated', DateType(), True),
        StructField('nrOfPictures', ShortType(), True),
        StructField('postalCode', StringType(), True),
        StructField('lastSeen', TimestampType(), True)
    ])

    df = spark \
        .read \
        .format('csv') \
        .schema(schema) \
        .option('header', 'true') \
        .load(DATASET)

    print('Data loaded into PySpark', '\n')

    # Returning the DataFrame
    return df

class Fixture(object):
    string1 = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    string2 = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    string_implicit = UnischemaField('random', np.string_, ())
    string_nullable = UnischemaField('random', np.string_, (), nullable=True)
    other_string = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
    int1 = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    int2 = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    other_int = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)

class Fixture(object):
    TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
    TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
    TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
    TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)

def test_insert_explicit_nulls(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], 0)
    self.assertEqual(row_dict['not_nullable'], 1)

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], None)
    self.assertEqual(row_dict['not_nullable'], 1)

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], None)
    self.assertEqual(row_dict['not_nullable'], 1)

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with self.assertRaises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)

def test_primitives():
    assert BooleanType() == parse_schema("bool")
    assert BooleanType() == parse_schema("boolean")
    assert ByteType() == parse_schema("byte")
    assert ByteType() == parse_schema("tinyint")
    assert ShortType() == parse_schema("short")
    assert ShortType() == parse_schema("smallint")
    assert IntegerType() == parse_schema("int")
    assert FloatType() == parse_schema("float")
    assert DoubleType() == parse_schema("double")
    assert StringType() == parse_schema("string")
    assert BinaryType() == parse_schema("binary")

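# A hedged companion check (not part of the original test, and the test name
# below is hypothetical): PySpark's own types expose the same SQL aliases via
# their simpleString() method, which is one way a parse_schema helper like
# the one above could be cross-checked.
def test_simple_string_aliases():
    from pyspark.sql.types import ByteType, ShortType
    assert ShortType().simpleString() == 'smallint'
    assert ByteType().simpleString() == 'tinyint'
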
def test_insert_explicit_nulls():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] == 0
    assert row_dict['not_nullable'] == 1

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with pytest.raises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)

def read_song_data(spark, song_files):
    """Read song data into a Spark DataFrame.

    Args:
        spark (obj): SparkSession
        song_files (list of str): list of full paths to song files on S3

    Returns:
        song_df (obj): DataFrame with song data
    """
    song_df_schema = StructType([
        StructField('artist_id', StringType()),
        StructField('artist_latitude', FloatType()),
        StructField('artist_longitude', FloatType()),
        StructField('artist_location', StringType()),
        StructField('artist_name', StringType()),
        StructField('song_id', StringType()),
        StructField('title', StringType()),
        StructField('duration', FloatType()),
        StructField('year', ShortType())
    ])

    # read song data files to DataFrame
    song_df = spark.read.json(song_files, schema=song_df_schema, mode='DROPMALFORMED')

    return song_df

def iff_schema(self):
    from pyspark.sql.types import (ShortType, StringType, StructType, StructField,
                                   LongType, IntegerType, DoubleType)

    myschema = StructType([
        StructField("recType", ShortType(), True),            # 1  // track point record type number
        StructField("recTime", StringType(), True),           # 2  // seconds since midnight 1/1/70 UTC
        StructField("fltKey", LongType(), True),              # 3  // flight key
        StructField("bcnCode", IntegerType(), True),          # 4  // digit range from 0 to 7
        StructField("cid", IntegerType(), True),              # 5  // computer flight id
        StructField("Source", StringType(), True),            # 6  // source of the record
        StructField("msgType", StringType(), True),           # 7
        StructField("callsign", StringType(), True),          # 8  // call sign
        StructField("recTypeCat", IntegerType(), True),       # 9
        StructField("latitude", DoubleType(), True),          # 10
        StructField("longitude", DoubleType(), True),         # 11
        StructField("altitude", DoubleType(), True),          # 12 // in 100s of feet
        StructField("significance", ShortType(), True),       # 13 // digit range from 1 to 10
        StructField("latAcc", DoubleType(), True),            # 14
        StructField("lonAcc", DoubleType(), True),            # 15
        StructField("altAcc", DoubleType(), True),            # 16
        StructField("tas", IntegerType(), True),              # 17 // in knots
        StructField("heading", DoubleType(), True),           # 18 // in degrees from true north
        StructField("rateOfClimb", DoubleType(), True),       # 19 // in feet per minute
        StructField("altQualifier", StringType(), True),      # 20 // altitude qualifier (the “B4 character”)
        StructField("altIndicator", StringType(), True),      # 21 // altitude indicator (the “C4 character”)
        StructField("trackPtStatus", StringType(), True),     # 22 // track point status (e.g., ‘C’ for coast)
        StructField("leaderDir", IntegerType(), True),        # 23 // int 0-8 representing the direction of the leader line
        StructField("scratchPad", StringType(), True),        # 24
        StructField("msawInhibitInd", ShortType(), True),     # 25 // MSAW Inhibit Indicator (0=not inhibited, 1=inhibited)
        StructField("assignedAltString", StringType(), True), # 26
        StructField("controllingFac", StringType(), True),    # 27
        StructField("controllingSec", StringType(), True),    # 28
        StructField("receivingFac", StringType(), True),      # 29
        StructField("receivingSec", StringType(), True),      # 30
        StructField("activeContr", IntegerType(), True),      # 31 // the active control number
        StructField("primaryContr", IntegerType(), True),     # 32 // the primary (previous, controlling, or possible next) controller number
        StructField("kybrdSubset", StringType(), True),       # 33 // identifies a subset of controller keyboards
        StructField("kybrdSymbol", StringType(), True),       # 34 // identifies a keyboard within the keyboard subsets
        StructField("adsCode", IntegerType(), True),          # 35 // arrival departure status code
        StructField("opsType", StringType(), True),           # 36 // operations type (O/E/A/D/I/U) from ARTS and ARTS 3A data
        StructField("airportCode", StringType(), True),       # 37
        StructField("trackNumber", IntegerType(), True),      # 38
        StructField("tptReturnType", StringType(), True),     # 39
        StructField("modeSCode", StringType(), True)          # 40
    ])
    return myschema

def check_column_numeric(df, column):
    return df.schema[column].dataType in [
        IntegerType(), ShortType(), LongType(),
        FloatType(), DecimalType(), DoubleType()
    ]

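# A hedged usage sketch for check_column_numeric; the demo function name and
# the inline DDL schema are assumptions made for illustration.
def _demo_check_column_numeric(spark):
    # 'n' is numeric (IntegerType), 's' is not.
    df = spark.createDataFrame([(1, 'a')], 'n INT, s STRING')
    assert check_column_numeric(df, 'n')
    assert not check_column_numeric(df, 's')
    # Note: the membership test compares parameterless instances, so a
    # DecimalType(10, 2) column would not match DecimalType() here.
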
def from_arrow_schema(cls, parquet_dataset):
    """
    Convert an Apache Arrow schema into a unischema object. This is useful for datasets of only
    scalars which need no special encoding/decoding. If there is an unsupported type in the arrow
    schema, it will throw an exception.

    :param parquet_dataset: A :class:`pyarrow.parquet.ParquetDataset` object; its arrow schema
        is the one converted.
    :return: A :class:`Unischema` object.
    """
    meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
    arrow_schema = meta.schema.to_arrow_schema()
    unischema_fields = []

    # Partition columns are not part of the parquet file schema; add them as string fields.
    for partition_name in parquet_dataset.partitions.partition_names:
        unischema_fields.append(UnischemaField(partition_name, np.str_, (),
                                               ScalarCodec(StringType()), False))

    for column_name in arrow_schema.names:
        arrow_field = arrow_schema.field_by_name(column_name)
        field_type = arrow_field.type
        if field_type == pyarrow.int8():
            np_type = np.int8
            codec = ScalarCodec(ByteType())
        elif field_type == pyarrow.int16():
            np_type = np.int16
            codec = ScalarCodec(ShortType())
        elif field_type == pyarrow.int32():
            np_type = np.int32
            codec = ScalarCodec(IntegerType())
        elif field_type == pyarrow.int64():
            np_type = np.int64
            codec = ScalarCodec(LongType())
        elif field_type == pyarrow.string():
            np_type = np.unicode_
            codec = ScalarCodec(StringType())
        elif field_type == pyarrow.bool_():
            np_type = np.bool_
            codec = ScalarCodec(BooleanType())
        elif field_type == pyarrow.float32():
            np_type = np.float32
            codec = ScalarCodec(FloatType())
        elif field_type == pyarrow.float64():
            np_type = np.float64
            codec = ScalarCodec(DoubleType())
        elif isinstance(field_type, pyarrow.lib.Decimal128Type):
            np_type = Decimal
            codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
        elif field_type == pyarrow.binary():
            np_type = np.string_
            codec = ScalarCodec(StringType())
        elif isinstance(field_type, pyarrow.lib.FixedSizeBinaryType):
            np_type = np.string_
            codec = ScalarCodec(StringType())
        else:
            raise ValueError('Cannot auto-create unischema due to unsupported column type '
                             '{}'.format(field_type))

        unischema_fields.append(UnischemaField(column_name, np_type, (), codec,
                                               arrow_field.nullable))

    return Unischema('inferred_schema', unischema_fields)

def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello",
          bytearray(b"spark\x01\x02"), -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)

    # TODO: we will improve the test once the batch_size argument is added.
    # For now we only have one batch.
    for i in range(converter.dataset_size):
        for col in df.schema.names:
            actual_ele = getattr(ts, col)[i]
            expected_ele = expected_df[i][col]
            if col == "str_col":
                actual_ele = actual_ele.decode()
            if col == "bin_col":
                actual_ele = bytearray(actual_ele)
            if col == "float_col" or col == "double_col":
                # Note that the default dtype is float32
                assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
            else:
                assert expected_ele == actual_ele

    assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type
    assert np.float32 == ts.double_col.dtype.type  # Default dtype float32
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type

def enrich_puzzle_data(puzzles: DataFrame) -> DataFrame:
    """Merge puzzle data with corresponding puzzle metadata.

    :param puzzles: PySpark DataFrame containing un-enriched puzzle data.
    :return: PySpark DataFrame containing metadata-enriched puzzles.
    """
    data_folder = Path('resources')
    meta = data_folder / 'puzzle_meta.json'
    # Use a context manager so the file handle is closed after loading.
    with open(meta) as f_enrichment:
        enrichment_data = json.load(f_enrichment)

    enrichment_schema = StructType([
        StructField('puzzle_no', ShortType(), False),
        StructField('final_jumble_groupings', ArrayType(ShortType()), False)
    ])
    enriched_puzzles = spark.createDataFrame(enrichment_data['puzzles'], enrichment_schema)

    return puzzles.join(other=enriched_puzzles, how='left', on='puzzle_no')

def test_assert_frame_equal(spark):
    expected = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    }, spark=spark)

    # Identical frame passes.
    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    }, spark=spark)
    assert_dataframe_equal(expected, result)

    # Different value fails.
    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 23]
    }, spark=spark)
    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    # Different number of rows fails.
    result = create_dataframe({'Name': ['Tom'], 'Age': [25]}, spark=spark)
    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    # Different schema fails, unless check_schema=False.
    new_schema = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True),
    ])
    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    }, schema=new_schema, spark=spark)
    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)
    assert_dataframe_equal(expected, result, check_schema=False)

    # Different row order fails, unless check_order=False.
    result = create_dataframe({
        'Name': ['Charlie', 'Tom'],
        'Age': [24, 25]
    }, spark=spark)
    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)
    assert_dataframe_equal(expected, result, check_order=False)

def schema(self):
    '''
    Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (FloatType, IntegerType, LongType,
                                   ShortType, StringType, StructField, StructType)

    return StructType([
        StructField('treceived', StringType(), True),
        StructField('unix_tstamp', LongType(), True),
        StructField('tryear', IntegerType(), True),
        StructField('trmonth', IntegerType(), True),
        StructField('trday', IntegerType(), True),
        StructField('trhour', IntegerType(), True),
        StructField('trminute', IntegerType(), True),
        StructField('trsecond', IntegerType(), True),
        StructField('tdur', FloatType(), True),
        StructField('sip', StringType(), True),
        StructField('dip', StringType(), True),
        StructField('sport', IntegerType(), True),
        StructField('dport', IntegerType(), True),
        StructField('proto', StringType(), True),
        StructField('flag', StringType(), True),
        StructField('fwd', IntegerType(), True),
        StructField('stos', IntegerType(), True),
        StructField('ipkt', LongType(), True),
        StructField('ibyt', LongType(), True),
        StructField('opkt', LongType(), True),
        StructField('obyt', LongType(), True),
        StructField('input', IntegerType(), True),
        StructField('output', IntegerType(), True),
        StructField('sas', IntegerType(), True),
        StructField('das', IntegerType(), True),
        StructField('dtos', IntegerType(), True),
        StructField('dir', IntegerType(), True),
        StructField('rip', StringType(), True),
        StructField('y', ShortType(), True),
        StructField('m', ShortType(), True),
        StructField('d', ShortType(), True),
        StructField('h', ShortType(), True)
    ])

def load_puzzles() -> DataFrame:
    """Automated puzzle loading.

    :return: PySpark DataFrame of puzzles and information about them.
    """
    data_folder = Path('resources')
    puzzle_file = data_folder / 'puzzles.json'
    # Use a context manager so the file handle is closed after loading.
    with open(puzzle_file) as f_puzzle:
        puzzle_data = json.load(f_puzzle)

    sort_word = udf(lambda j: ''.join(sorted(j)))
    puzzle_schema = StructType([
        StructField('puzzle_no', ShortType(), False),
        StructField('scrambled_word', StringType(), False),
        StructField('key_indices', ArrayType(ShortType()), False)
    ])
    puzzles = spark.createDataFrame(puzzle_data['jumbles'], puzzle_schema) \
        .withColumn('sorted_word', sort_word(col('scrambled_word')))

    return enrich_puzzle_data(puzzles)

def struct_type():
    '''
    Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   ShortType, IntegerType, FloatType)

    return StructType([
        StructField('data', StringType(), True),
        StructField('event_id', IntegerType(), True),
        StructField('event_second', IntegerType(), True),
        StructField('length', IntegerType(), True),
        StructField('linktype', ShortType(), True),
        StructField('sensor_id', IntegerType(), True),
        StructField('unix_tstamp', FloatType(), True),
        StructField('y', ShortType(), True),
        StructField('m', ShortType(), True),
        StructField('d', ShortType(), True),
        StructField('h', ShortType(), True)
    ])

def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type

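# A minimal sketch (the demo function name is hypothetical) exercising the
# int16 and list branches of from_arrow_type above; assumes pyarrow is
# installed and ShortType/ArrayType are imported as in the surrounding module.
def _demo_from_arrow_type():
    import pyarrow as pa
    assert from_arrow_type(pa.int16()) == ShortType()
    assert from_arrow_type(pa.list_(pa.int16())) == ArrayType(ShortType())
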
def test_as_spark_type_extension_dtypes(self):
    from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

    type_mapper = {
        Int8Dtype(): ByteType(),
        Int16Dtype(): ShortType(),
        Int32Dtype(): IntegerType(),
        Int64Dtype(): LongType(),
    }

    for extension_dtype, spark_type in type_mapper.items():
        self.assertEqual(as_spark_type(extension_dtype), spark_type)

def sqlType(cls):
    """
    Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
    """
    return StructType([
        StructField(
            "cell_context",
            StructType([
                StructField(
                    "cellType",
                    StructType([StructField("cellTypeName", StringType(), False)]),
                    False),
                StructField(
                    "dimensions",
                    StructType([
                        StructField("cols", ShortType(), False),
                        StructField("rows", ShortType(), False)
                    ]),
                    False),
            ]),
            False),
        StructField(
            "cell_data",
            StructType([
                StructField("cells", BinaryType(), True),
                StructField(
                    "ref",
                    StructType([
                        StructField("source", RasterSourceUDT(), False),
                        StructField("bandIndex", IntegerType(), False),
                        StructField(
                            "subextent",
                            StructType([
                                StructField("xmin", DoubleType(), False),
                                StructField("ymin", DoubleType(), False),
                                StructField("xmax", DoubleType(), False),
                                StructField("ymax", DoubleType(), False)
                            ]),
                            True)
                    ]),
                    True)
            ]),
            False)
    ])

def struct_type():
    '''
    Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   ShortType, IntegerType, LongType, FloatType)

    return StructType([
        StructField('blocked', ShortType(), True),
        StructField('classification', StringType(), True),
        StructField('classification_id', IntegerType(), True),
        StructField('destination_ip', StringType(), True),
        StructField('dport_icode', IntegerType(), True),
        StructField('event_id', IntegerType(), True),
        StructField('generator_id', IntegerType(), True),
        StructField('impact', IntegerType(), True),
        StructField('impact_flag', ShortType(), True),
        StructField('priority', IntegerType(), True),
        StructField('protocol', IntegerType(), True),
        StructField('sensor_id', IntegerType(), True),
        StructField('signature_id', LongType(), True),
        StructField('signature_revision', IntegerType(), True),
        StructField('sport_itype', LongType(), True),
        StructField('source_ip', StringType(), True),
        StructField('vlan_id', IntegerType(), True),
        StructField('unix_tstamp', FloatType(), True),
        StructField('y', ShortType(), True),
        StructField('m', ShortType(), True),
        StructField('d', ShortType(), True),
        StructField('h', ShortType(), True)
    ])

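# A hedged usage sketch (not from the source): applying the schema returned by
# struct_type() when parsing a delimited alert feed. The demo function name,
# the SparkSession parameter, and the default file path are assumptions made
# for illustration.
def _demo_load_alerts(spark, path='/tmp/alerts.csv'):
    alerts_df = spark.read.csv(path, schema=struct_type(), header=True)
    alerts_df.printSchema()
    return alerts_df
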
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return codec, np_type

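# A minimal sketch (hypothetical demo function) exercising the int16 branch of
# _numpy_and_codec_from_arrow_type above; assumes pyarrow, numpy, and
# ScalarCodec are importable as in the surrounding module.
def _demo_numpy_and_codec():
    import pyarrow as pa
    codec, np_type = _numpy_and_codec_from_arrow_type(pa.int16())
    assert np_type == np.int16         # int16 arrow columns decode to numpy int16
    assert isinstance(codec, ScalarCodec)  # and are backed by Spark's ShortType
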
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)

def map_precision_col_df(df: pyspark.sql.dataframe.DataFrame,
                         col_name: str) -> pyspark.sql.types.DataType:
    # Pick the narrowest integral type that can hold a decimal of the given
    # precision: 2 digits fit a byte (max 127), 4 a short (max 32767),
    # 9 an int (max 2147483647); anything wider falls back to long.
    if isinstance(df.schema[col_name].dataType, DecimalType):
        data_precision = df.schema[col_name].dataType.precision
        if data_precision <= 2:
            return ByteType()
        elif 2 < data_precision <= 4:
            return ShortType()
        elif 4 < data_precision <= 9:
            return IntegerType()
        else:
            return LongType()
    # Non-decimal columns are mapped to strings.
    return StringType()

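# A hedged usage sketch for map_precision_col_df; the demo function name and
# the sample query are assumptions made for illustration.
def _demo_map_precision(spark):
    # DECIMAL(4, 0) has precision 4, so the narrowest fit is a short.
    df = spark.sql("SELECT CAST(1234 AS DECIMAL(4, 0)) AS amount")
    assert map_precision_col_df(df, 'amount') == ShortType()
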
def make_songplay_data(d_artist_df, d_song_df, event_df):
    """
    Create the songplay fact dataframe.

    Parameters:
        d_artist_df (DataFrame): The artist dimension dataframe.
        d_song_df (DataFrame): The song dimension dataframe.
        event_df (DataFrame): The raw song play event dataframe.

    Returns:
        f_songplay_df (DataFrame): A songplay fact dataframe.
    """
    print('\nmake_songplay_data...')

    tmp_df = d_song_df.withColumnRenamed('artist_id', 'song_artist_id')
    tmp_df = tmp_df.join(d_artist_df, d_artist_df.artist_id == tmp_df.song_artist_id) \
        .select('song_id', 'title', 'duration', 'artist_id', 'artist_name')

    comparison = [
        event_df.song == tmp_df.title,
        event_df.length.cast(ShortType()) == tmp_df.duration.cast(ShortType())
    ]

    # extract columns from joined song and log datasets to create songplays table
    # create hash of timestamp, userId and song for unique songplay ID
    # year and month columns exist for partitioning parquet files
    f_songplay_df = event_df.withColumn('songplay_id',
                                        F.sha1(F.concat_ws('|', 'timestamp', 'userId', 'song'))) \
        .withColumn('year', F.year('timestamp')) \
        .withColumn('month', F.month('timestamp')) \
        .join(tmp_df, comparison, 'left') \
        .select(['songplay_id', 'start_time', 'year', 'month', 'userId', 'level',
                 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent'])

    print('songplay fact record count:', f_songplay_df.count())
    not_null_count = f_songplay_df.filter(F.col('song_id').isNotNull()).count()
    print('songplay fact records with song_id value:', not_null_count)

    return f_songplay_df

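# Why the ShortType() casts in the join condition above can help: casting a
# fractional duration to a short truncates it to whole seconds, so track
# lengths that differ only in the fractional part still match. A minimal
# sketch; the demo function name is an assumption.
def _demo_short_cast_match(spark):
    df = spark.sql("SELECT CAST(239.31 AS DOUBLE) AS a, CAST(239.88 AS DOUBLE) AS b")
    matched = df.select(
        (F.col('a').cast(ShortType()) == F.col('b').cast(ShortType())).alias('eq')).first()
    assert matched['eq']  # both values truncate to 239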