def test_udt(self):
    from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _make_type_verifier

    def check_datatype(datatype):
        pickled = pickle.loads(pickle.dumps(datatype))
        assert datatype == pickled
        scala_datatype = self.spark._jsparkSession.parseDataType(datatype.json())
        python_datatype = _parse_datatype_json_string(scala_datatype.json())
        assert datatype == python_datatype

    check_datatype(ExamplePointUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", ExamplePointUDT(), False)])
    check_datatype(structtype_with_udt)
    p = ExamplePoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), ExamplePointUDT())
    _make_type_verifier(ExamplePointUDT())(ExamplePoint(1.0, 2.0))
    self.assertRaises(ValueError, lambda: _make_type_verifier(ExamplePointUDT())([1.0, 2.0]))

    check_datatype(PythonOnlyUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", PythonOnlyUDT(), False)])
    check_datatype(structtype_with_udt)
    p = PythonOnlyPoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), PythonOnlyUDT())
    _make_type_verifier(PythonOnlyUDT())(PythonOnlyPoint(1.0, 2.0))
    self.assertRaises(
        ValueError,
        lambda: _make_type_verifier(PythonOnlyUDT())([1.0, 2.0]))
def test_udt(self):
    from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _verify_type
    from pyspark.sql.tests import ExamplePointUDT, ExamplePoint

    def check_datatype(datatype):
        pickled = pickle.loads(pickle.dumps(datatype))
        assert datatype == pickled
        scala_datatype = self.sqlCtx._ssql_ctx.parseDataType(datatype.json())
        python_datatype = _parse_datatype_json_string(scala_datatype.json())
        assert datatype == python_datatype

    check_datatype(ExamplePointUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", ExamplePointUDT(), False)])
    check_datatype(structtype_with_udt)
    p = ExamplePoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), ExamplePointUDT())
    _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT())
    self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], ExamplePointUDT()))

    check_datatype(PythonOnlyUDT())
    structtype_with_udt = StructType([StructField("label", DoubleType(), False),
                                      StructField("point", PythonOnlyUDT(), False)])
    check_datatype(structtype_with_udt)
    p = PythonOnlyPoint(1.0, 2.0)
    self.assertEqual(_infer_type(p), PythonOnlyUDT())
    _verify_type(PythonOnlyPoint(1.0, 2.0), PythonOnlyUDT())
    self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], PythonOnlyUDT()))
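# Both test variants above lean on ExamplePointUDT/PythonOnlyUDT from pyspark.sql.tests.
# For readers without that module handy, here is a minimal sketch of what a Python-only
# UDT looks like, assuming the UserDefinedType API in pyspark.sql.types; MyPoint and
# MyPointUDT are hypothetical names, not part of any library.
from pyspark.sql.types import ArrayType, DoubleType, UserDefinedType


class MyPointUDT(UserDefinedType):
    """UDT that stores a point on the SQL side as a non-null array of doubles."""

    @classmethod
    def sqlType(cls):
        # Underlying SQL storage type for serialized points.
        return ArrayType(DoubleType(), False)

    @classmethod
    def module(cls):
        # Module used to locate this class when the type is reconstructed.
        return '__main__'

    def serialize(self, obj):
        return [obj.x, obj.y]

    def deserialize(self, datum):
        return MyPoint(datum[0], datum[1])


class MyPoint(object):
    """Plain value class; __UDT__ is what lets _infer_type map it to MyPointUDT."""
    __UDT__ = MyPointUDT()

    def __init__(self, x, y):
        self.x = x
        self.y = y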
def test_infer_binary_type(self):
    binaryrow = [Row(f1='a', f2=b"abcd")]
    df = self.sc.parallelize(binaryrow).toDF()
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())

    # Saving as Parquet caused issues as well.
    output_dir = os.path.join(self.tempdir.name, "infer_binary_type")
    df.write.parquet(output_dir)
    df1 = self.spark.read.parquet(output_dir)
    self.assertEqual('a', df1.first().f1)
    self.assertEqual(b"abcd", df1.first().f2)

    self.assertEqual(_infer_type(b""), BinaryType())
    self.assertEqual(_infer_type(b"1234"), BinaryType())
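# Outside the test harness, the same round trip can pin the binary column down with an
# explicit schema instead of relying on inference. A minimal sketch, assuming an active
# SparkSession bound to `spark`; the output path is hypothetical.
from pyspark.sql.types import BinaryType, StringType, StructField, StructType

schema = StructType([
    StructField("f1", StringType(), True),
    StructField("f2", BinaryType(), True),
])
df = spark.createDataFrame([("a", b"abcd")], schema)
df.write.parquet("/tmp/binary_roundtrip")
assert spark.read.parquet("/tmp/binary_roundtrip").first().f2 == b"abcd"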
def infer_schema(self, rec):
    """Infers a DataFrame schema for a record. Assumes every dict is a Struct, not a Map."""
    if isinstance(rec, dict):
        return pst.StructType([pst.StructField(key, self.infer_schema(value), True)
                               for key, value in sorted(rec.items())])
    elif isinstance(rec, list):
        if len(rec) == 0:
            raise ValueError("can't infer type of an empty list")
        elem_type = self.infer_schema(rec[0])
        for elem in rec:
            this_type = self.infer_schema(elem)
            if elem_type != this_type:
                raise ValueError("can't infer type of a list with inconsistent elem types")
        return pst.ArrayType(elem_type)
    else:
        return pst._infer_type(rec)
def test_infer_long_type(self):
    longrow = [Row(f1='a', f2=100000000000000)]
    df = self.sc.parallelize(longrow).toDF()
    self.assertEqual(df.schema.fields[1].dataType, LongType())

    # Saving as Parquet caused issues as well.
    output_dir = os.path.join(self.tempdir.name, "infer_long_type")
    df.saveAsParquetFile(output_dir)
    df1 = self.sqlCtx.parquetFile(output_dir)
    self.assertEqual('a', df1.first().f1)
    self.assertEqual(100000000000000, df1.first().f2)

    self.assertEqual(_infer_type(1), LongType())
    self.assertEqual(_infer_type(2**10), LongType())
    self.assertEqual(_infer_type(2**20), LongType())
    self.assertEqual(_infer_type(2**31 - 1), LongType())
    self.assertEqual(_infer_type(2**31), LongType())
    self.assertEqual(_infer_type(2**61), LongType())
    self.assertEqual(_infer_type(2**71), LongType())
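# saveAsParquetFile/parquetFile above are the pre-Spark-1.4, SQLContext-era calls; on
# current Spark the same round trip goes through DataFrameWriter/DataFrameReader. A
# minimal sketch, assuming an active SparkSession bound to `spark`; the output path
# is hypothetical.
from pyspark.sql import Row

df = spark.createDataFrame([Row(f1='a', f2=100000000000000)])
df.write.parquet("/tmp/infer_long_type")
df1 = spark.read.parquet("/tmp/infer_long_type")
assert df1.first().f2 == 100000000000000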
def infer_schema(record):
    """Infers a DataFrame schema for a record. Assumes every dict is a Struct, not a Map."""
    if isinstance(record, dict):
        # sorted(record.items()) -> (key, value): we need a list of tuples rather than
        # dict_items, and sorting keeps the inferred struct constant per record type.
        return pst.StructType([
            pst.StructField(key, infer_schema(value), True)
            for key, value in sorted(record.items())
        ])
    elif isinstance(record, list):
        if len(record) == 0:
            raise ValueError("can't infer type of an empty list")
        element_type = infer_schema(record[0])
        for element in record:
            this_type = infer_schema(element)
            if element_type != this_type:
                raise ValueError(
                    "can't infer type of a list with inconsistent element types"
                )
        return pst.ArrayType(element_type)
    else:
        return pst._infer_type(record)
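# A short usage sketch for infer_schema (either variant above), assuming `pst` is
# `import pyspark.sql.types as pst` and an active SparkSession bound to `spark`;
# the record value is illustrative.
import pyspark.sql.types as pst

record = {"name": "a", "scores": [1, 2, 3], "meta": {"ok": True}}
schema = infer_schema(record)  # nested dict -> StructType, list -> ArrayType(LongType())
df = spark.createDataFrame([record], schema)
df.printSchema()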