def guess_type_from_values_as_string(values, options):
    """Return the first candidate type that every string in ``values`` casts to.

    Reproduces inferences available in Spark
    PartitioningUtils.inferPartitionColumnValue()
    located in org.apache.spark.sql.execution.datasources

    Candidates are tried in this order: IntegerType, LongType, DecimalType,
    DoubleType, TimestampType, StringType. A candidate is accepted when no
    value casts to ``None``, except for the values ``"null"`` and ``None``
    which are allowed to do so. A ``ValueError`` raised by the caster while
    converting a value is treated as a mismatch for that candidate.

    :param values: iterable of raw values to inspect
    :param options: options forwarded to ``get_caster``
    :return: the instance of the first matching candidate type
    :raises AnalysisException: if no candidate accepts all the values
    """
    candidate_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    source_type = StringType()

    for candidate in candidate_types:
        cast = get_caster(from_type=source_type, to_type=candidate, options=options)
        try:
            # Stops at the first value that cannot be represented in this type
            every_value_fits = all(
                cast(raw) is not None or raw in ("null", None)
                for raw in values
            )
        except ValueError:
            every_value_fits = False

        if every_value_fits:
            return candidate

    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
def test_session_create_data_frame_from_pandas_data_frame(self):
    # Checks that createDataFrame() accepts a pandas DataFrame and that the
    # resulting row count, collected rows and schema match the source frame.
    try:
        # Pandas is an optional dependency
        # pylint: disable=import-outside-toplevel
        import pandas as pd
    except ImportError as e:
        raise ImportError("pandas is not importable") from e

    # No column names are given, so the expectations below refer to the two
    # columns by their positional labels "0" and "1".
    pdf = pd.DataFrame([
        (1, "one"),
        (2, "two"),
        (3, "three"),
    ])

    df = self.spark.createDataFrame(pdf)

    self.assertEqual(df.count(), 3)
    # Every expected row carries the same two fields, "0" and "1", as the
    # schema asserted at the end of this test.
    self.assertListEqual(
        df.collect(),
        [
            Row(**{"0": 1, "1": 'one'}),
            Row(**{"0": 2, "1": 'two'}),
            Row(**{"0": 3, "1": 'three'}),
        ]
    )
    self.assertEqual(
        df.schema,
        StructType([
            StructField("0", LongType(), True),
            StructField("1", StringType(), True),
        ])
    )
def test_cast_row_to_string(self):
    # Casts a Row holding a nested map, a None field, a boolean and a double
    # to its string form. Per the expected text, the None map value and the
    # None field render as nothing and True renders as "true".
    row = Row(
        a=collections.OrderedDict([("value", None), ("b", {"c": 7})]),
        b=None,
        c=True,
        d=5.2,
    )
    nested_map_type = MapType(
        StringType(),
        MapType(StringType(), LongType(), True),
        True,
    )
    row_schema = StructType([
        StructField("a", nested_map_type, True),
        StructField("b", LongType(), True),
        StructField("c", BooleanType(), True),
        StructField("d", DoubleType(), True),
    ])

    rendered = cast_to_string(row, row_schema, options=BASE_OPTIONS)

    self.assertEqual(rendered, "[[value ->, b -> [c -> 7]],, true, 5.2]")
def test_session_create_data_frame_from_list(self):
    # Builds a DataFrame from a list of 2-tuples without a schema and checks
    # the row count, the collected rows and the resulting schema. The
    # expectations name the two columns "_1" and "_2".
    records = [
        (1, "one"),
        (2, "two"),
        (3, "three"),
    ]
    df = self.spark.createDataFrame(records)

    self.assertEqual(df.count(), 3)

    expected_rows = [
        Row(_1=1, _2='one'),
        Row(_1=2, _2='two'),
        Row(_1=3, _2='three'),
    ]
    self.assertListEqual(df.collect(), expected_rows)

    expected_schema = StructType([
        StructField("_1", LongType(), True),
        StructField("_2", StringType(), True),
    ])
    self.assertEqual(df.schema, expected_schema)
def test_cast_float_to_long(self):
    # The value 2**63 - 1 is expected to come back unchanged from
    # cast_to_long().
    # NOTE(review): the name says "float" but the input is an int literal
    # declared as LongType - confirm this is intended.
    max_long = 9223372036854775807
    result = cast_to_long(max_long, LongType(), options=BASE_OPTIONS)
    self.assertEqual(result, max_long)
def test_cast_float_to_int_with_loop(self):
    # The value 2**31 is expected to wrap around to -2**31 when cast to int.
    # NOTE(review): the name says "float" but the input is an int literal
    # declared as LongType - confirm this is intended.
    overflowing_value = 2147483648
    wrapped = cast_to_int(overflowing_value, LongType(), options=BASE_OPTIONS)
    self.assertEqual(wrapped, -2147483648)
def test_cast_float_to_int(self):
    # The value 2**31 - 1 is expected to come back unchanged from
    # cast_to_int().
    # NOTE(review): the name says "float" but the input is an int literal
    # declared as LongType - confirm this is intended.
    max_int = 2147483647
    result = cast_to_int(max_int, LongType(), options=BASE_OPTIONS)
    self.assertEqual(result, max_int)