Example #1
0
 def check(actual, expected):
     """Assert that ``actual`` matches the columns and labels in ``expected``.

     ``actual`` unpacks into a sized collection of Spark columns plus their
     labels. ``expected`` unpacks into the names used to look the reference
     columns up in the enclosing ``sdf`` plus the labels to compare against.
     """
     scols, labels = actual
     wanted_names, wanted_labels = expected
     # Compare sizes first: zip() below would silently ignore any surplus.
     self.assertEqual(len(scols), len(wanted_names))
     for scol, wanted_name in zip(scols, wanted_names):
         same = spark_column_equals(scol, sdf[wanted_name])
         self.assertTrue(same)
     self.assertEqual(labels, wanted_labels)
Example #2
0
 def test_lit(self):
     # SF.lit applied to a NumPy scalar is expected to compare equal (per
     # spark_column_equals) to F.lit cast to the matching Spark type; a
     # plain Python int is compared against an uncast F.lit.
     def expect_same(produced, reference):
         self.assertTrue(spark_column_equals(produced, reference))

     expect_same(SF.lit(np.int64(1)), F.lit(1).astype(LongType()))
     expect_same(SF.lit(np.int32(1)), F.lit(1).astype(IntegerType()))
     expect_same(SF.lit(np.int8(1)), F.lit(1).astype(ByteType()))
     expect_same(SF.lit(np.byte(1)), F.lit(1).astype(ByteType()))
     expect_same(SF.lit(np.float32(1)), F.lit(float(1)).astype(FloatType()))
     expect_same(SF.lit(1), F.lit(1))
Example #3
0
    def test_from_pandas(self):
        # Builds an InternalFrame from pandas frames of increasing complexity
        # (default index, non-string column names, multi-index, multi-index
        # columns) and checks the reported names/labels, the backing Spark
        # columns, and the conversion back to pandas.
        def expect_same(produced, reference):
            self.assertTrue(spark_column_equals(produced, reference))

        pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        frame = InternalFrame.from_pandas(pandas_df)
        spark_df = frame.spark_frame

        self.assert_eq(
            frame.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(frame.index_names, [None])
        self.assert_eq(frame.column_labels, [("a",), ("b",)])
        self.assert_eq(frame.data_spark_column_names, ["a", "b"])
        for name in ("a", "b"):
            expect_same(frame.spark_column_for((name,)), spark_df[name])

        self.assert_eq(frame.to_pandas_frame, pandas_df)

        # non-string column name
        int_named_df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})

        frame = InternalFrame.from_pandas(int_named_df)
        spark_df = frame.spark_frame

        self.assert_eq(
            frame.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(frame.index_names, [None])
        self.assert_eq(frame.column_labels, [(0,), (1,)])
        self.assert_eq(frame.data_spark_column_names, ["0", "1"])
        for number in (0, 1):
            # The Spark column is looked up by the stringified label.
            expect_same(
                frame.spark_column_for((number,)), spark_df[str(number)])

        self.assert_eq(frame.to_pandas_frame, int_named_df)

        # multi-index: "a" moves from the data columns into the index
        pandas_df.set_index("a", append=True, inplace=True)

        frame = InternalFrame.from_pandas(pandas_df)
        spark_df = frame.spark_frame

        self.assert_eq(
            frame.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(level) for level in range(2)],
        )
        self.assert_eq(frame.index_names, [None, ("a",)])
        self.assert_eq(frame.column_labels, [("b",)])
        self.assert_eq(frame.data_spark_column_names, ["b"])
        expect_same(frame.spark_column_for(("b",)), spark_df["b"])

        self.assert_eq(frame.to_pandas_frame, pandas_df)

        # multi-index columns
        pandas_df.columns = pd.MultiIndex.from_tuples([("x", "b")])

        frame = InternalFrame.from_pandas(pandas_df)
        spark_df = frame.spark_frame

        self.assert_eq(
            frame.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(level) for level in range(2)],
        )
        self.assert_eq(frame.index_names, [None, ("a",)])
        self.assert_eq(frame.column_labels, [("x", "b")])
        self.assert_eq(frame.data_spark_column_names, ["(x, b)"])
        expect_same(frame.spark_column_for(("x", "b")), spark_df["(x, b)"])

        self.assert_eq(frame.to_pandas_frame, pandas_df)