def test_session_create_data_frame_from_list(self):
    df = self.spark.createDataFrame([(1, 'one'), (2, 'two'), (3, 'three')])
    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')],
    )
    self.assertEqual(
        df.schema,
        StructType([StructField('_1', LongType(), True), StructField('_2', StringType(), True)]),
    )

def test_cast_row_to_string(self):
    self.assertEqual(
        cast_to_string(
            Row(
                a=collections.OrderedDict([('value', None), ('b', {'c': 7})]),
                b=None,
                c=True,
                d=5.2,
            ),
            StructType([
                StructField(
                    'a',
                    MapType(
                        StringType(),
                        MapType(StringType(), LongType(), True),
                        True,
                    ),
                    True,
                ),
                StructField('b', LongType(), True),
                StructField('c', BooleanType(), True),
                StructField('d', DoubleType(), True),
            ]),
            options=BASE_OPTIONS,
        ),
        # null values render as empty strings: 'value ->,' for the null map
        # value and ',,' for the null field b
        '[[value ->, b -> [c -> 7]],, true, 5.2]',
    )

def test_session_create_data_frame_from_pandas_data_frame(self):
    try:
        # Pandas is an optional dependency
        # pylint: disable=import-outside-toplevel
        import pandas as pd
    except ImportError:
        self.skipTest('pandas is not installed')
    pdf = pd.DataFrame([(1, 'one'), (2, 'two'), (3, 'three')])
    df = self.spark.createDataFrame(pdf)
    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [
            Row(**{'0': 1, '1': 'one'}),
            Row(**{'0': 2, '1': 'two'}),
            Row(**{'0': 3, '1': 'three'}),
        ],
    )
    self.assertEqual(
        df.schema,
        StructType([StructField('0', LongType(), True), StructField('1', StringType(), True)]),
    )

def test_cast_to_struct(self):
    self.assertEqual(
        cast_to_struct(
            Row(character='Alice', day='28', month='8', year='2019'),
            from_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', StringType()),
                StructField('month', StringType()),
                StructField('year', StringType()),
            ]),
            to_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', IntegerType()),
                StructField('month', IntegerType()),
                StructField('year', IntegerType()),
            ]),
            options=BASE_OPTIONS,
        ),
        Row(character='Alice', day=28, month=8, year=2019),
    )

def create_counts_row(col1_item, rows):
    # relies on column_size, distinct_col2 and clean_element from the
    # enclosing scope
    counts_row = [None] * (column_size + 1)

    def parse_row(row):
        column_index = distinct_col2[clean_element(row[1])]
        counts_row[int(column_index + 1)] = int(row[2])

    rows.foreach(parse_row)
    # the value of col1 is the first value, the rest are the counts
    counts_row[0] = clean_element(col1_item)
    return Row(*counts_row)

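# Illustrative context (an assumption, not part of the original source):
# create_counts_row is written as a closure inside a crosstab-style
# computation, and the names it captures (column_size, distinct_col2,
# clean_element) would be set up by that enclosing code roughly as below.
# _Rows and the sample triples are hypothetical stand-ins for the RDD of
# (col1, col2, count) rows it receives.
class _Rows:
    """Minimal stand-in exposing the foreach() that create_counts_row uses."""

    def __init__(self, rows):
        self._rows = rows

    def foreach(self, f):
        for row in self._rows:
            f(row)


def clean_element(element):
    # crosstab output conventionally renders a missing key as the string 'null'
    return 'null' if element is None else str(element)


# (col1, col2, count) triples, e.g. the result of a GROUP BY col1, col2 count
triples = [('a', 'x', 2), ('a', 'y', 1), ('b', 'x', 3)]
distinct_col2 = {v: i for i, v in enumerate(sorted({clean_element(t[1]) for t in triples}))}
column_size = len(distinct_col2)

# With this context, create_counts_row('a', _Rows([t for t in triples if t[0] == 'a']))
# yields Row('a', 2, 1): the col1 value followed by one count per distinct col2 value.
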
def test_column_stat_helper():
    """
    Expected quantile values come from use of
    org.apache.spark.sql.catalyst.util.QuantileSummaries
    """
    schema = StructType([StructField('value', IntegerType())])
    helper = ColumnStatHelper(col('value'))
    for i in range(1, 100001):
        helper.merge(Row(value=i), schema)
    helper.finalize()
    assert helper.count == 100000
    assert helper.min == 1
    assert helper.max == 100000
    assert helper.mean == 50000.5
    assert helper.stddev == 28867.65779668774  # sample standard deviation
    assert helper.get_quantile(0) == 1
    assert helper.get_quantile(0.25) == 24998
    assert helper.get_quantile(0.5) == 50000
    assert helper.get_quantile(0.75) == 74993
    assert helper.get_quantile(1) == 100000

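# Note (assumption): Spark's QuantileSummaries is a Greenwald-Khanna-style
# sketch, so get_quantile returns approximate order statistics -- which is why
# quantile(0.25) above is 24998 rather than exactly 25000. A hedged helper for
# asserting within a relative-error bound instead of on exact sketch output;
# the 1e-4 bound is illustrative, inferred from the values above, not taken
# from the source:
def assert_quantile_within(helper, quantile, count, rel_err=1e-4):
    exact = max(1, round(quantile * count))
    assert abs(helper.get_quantile(quantile) - exact) <= rel_err * count
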
def test_session_range(self):
    df = self.spark.range(3)
    self.assertEqual(df.count(), 3)
    self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
    self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)])

def test_session_create_data_frame_from_list_with_schema(self):
    schema = StructType([StructField('map', MapType(StringType(), IntegerType()), True)])
    df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
    self.assertEqual(df.count(), 1)
    self.assertListEqual(df.collect(), [Row(map={'a': 1})])
    self.assertEqual(df.schema, schema)