Example #1
0
    def test_coalescing_heavy_type_mismatch(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, [
                2,
            ]), (3, [
                3,
            ]), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.ArrayType(T.IntegerType())),
            ]),
        )

        with self.assertRaises(U.AnalysisException):
            SF.multijoin([first_df, second_df],
                         on='id',
                         how='inner',
                         coalesce=['value'])
Example #2
0
    def test_coalescing(self):
        first_df = self.spark.createDataFrame(
            data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, 'hey'), (3, 'you'), (4, None)],
            schema=T.StructType([
                T.StructField('id', T.IntegerType()),
                T.StructField('value', T.StringType()),
            ]),
        )

        joined_df = SF.multijoin([first_df, second_df],
                                 on='id',
                                 how='inner',
                                 coalesce=['value'])

        self.assertDataFrameEqual(
            joined_df,
            [{
                'id': 2,
                'value': 'hi'
            }, {
                'id': 3,
                'value': 'you'
            }, {
                'id': 4,
                'value': 'may'
            }],
        )
Example #3
0
    def test_outer_join(self):
        first_df = self.spark.createDataFrame(
            data=[(1, ), (2, ), (3, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        second_df = self.spark.createDataFrame(
            data=[(2, ), (3, ), (4, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )
        third_df = self.spark.createDataFrame(
            data=[(3, ), (4, ), (5, )],
            schema=T.StructType([T.StructField('id', T.IntegerType())]),
        )

        joined_df = SF.multijoin([first_df, second_df, third_df], on='id', how='outer')

        self.assertDataFrameEqual(joined_df, [{'id': i} for i in [1, 2, 3, 4, 5]])
Example #4
0
 def test_no_dataframes_in_the_input(self):
     joined_df = SF.multijoin([])
     self.assertIsNone(joined_df)