Example #1
    def test_udf_as_join_condition(self):
        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
        f = udf(lambda a: a, IntegerType())

        df = left.join(right, [f("a") == f("b"), left.a1 == right.b1])
        self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)])
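The snippets on this page are methods taken from PySpark's UDF test suites, so they assume a test class that holds a SparkSession in self.spark and has already imported Row, udf, and the pyspark.sql.types classes. A minimal standalone sketch of the same join-condition check, with the session setup added here as an assumption, could look like this:

from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Local session just for this sketch; the original tests reuse self.spark.
spark = SparkSession.builder.master("local[1]").appName("udf-join-sketch").getOrCreate()

left = spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
right = spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
f = udf(lambda a: a, IntegerType())

# The Python UDF comparison and the plain equi-condition together form the join condition.
df = left.join(right, [f("a") == f("b"), left.a1 == right.b1])
print(df.collect())  # expected: [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]

spark.stop()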
Example #2
    def test_udf_in_generate(self):
        from pyspark.sql.functions import udf, explode
        df = self.spark.range(5)
        f = udf(lambda x: list(range(x)), ArrayType(LongType()))
        row = df.select(explode(f(*df))).groupBy().sum().first()
        self.assertEqual(row[0], 10)

        df = self.spark.range(3)
        res = df.select("id", explode(f(df.id))).collect()
        self.assertEqual(res[0][0], 1)
        self.assertEqual(res[0][1], 0)
        self.assertEqual(res[1][0], 2)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 2)
        self.assertEqual(res[2][1], 1)

        range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
        res = df.select("id", explode(range_udf(df.id))).collect()
        self.assertEqual(res[0][0], 0)
        self.assertEqual(res[0][1], -1)
        self.assertEqual(res[1][0], 0)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 1)
        self.assertEqual(res[2][1], 0)
        self.assertEqual(res[3][0], 1)
        self.assertEqual(res[3][1], 1)
Example #3
 def test_nondeterministic_udf3(self):
     # regression test for SPARK-23233
     f = udf(lambda x: x)
     # Here we cache the JVM UDF instance.
     self.spark.range(1).select(f("id"))
     # This should reset the cache to set the deterministic status correctly.
     f = f.asNondeterministic()
     # Check the deterministic status of udf.
     df = self.spark.range(1).select(f("id"))
     deterministic = df._jdf.logicalPlan().projectList().head().deterministic()
     self.assertFalse(deterministic)
Example #5
 def test_udf_in_filter_on_top_of_join(self):
     # regression test for SPARK-18589
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(b=1)])
     f = udf(lambda a, b: a == b, BooleanType())
     df = left.crossJoin(right).filter(f("a", "b"))
     self.assertEqual(df.collect(), [Row(a=1, b=1)])
Example #6
 def test_udf_in_subquery(self):
     f = udf(lambda x: x, "long")
     with self.tempView("v"):
         self.spark.range(1).filter(f("id") >= 0).createTempView("v")
         sql = self.spark.sql
         result = sql("select i from values(0L) as data(i) where i in (select id from v)")
         self.assertEqual(result.collect(), [Row(i=0)])
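self.tempView("v") is a helper from PySpark's testing utilities that guarantees the temporary view is dropped again when the block exits. A rough, hypothetical sketch of such a context manager (the real helper lives on the test mixin and takes self instead of an explicit session) might be:

from contextlib import contextmanager

@contextmanager
def tempView(spark, *names):
    # Run the block, then drop the named temporary views even if an assertion failed.
    try:
        yield
    finally:
        for name in names:
            spark.catalog.dropTempView(name)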
Example #9
 def runWithJoinType(join_type, type_string):
     with self.assertRaisesRegex(
             AnalysisException,
             """Python UDF in the ON clause of a %s JOIN.""" %
             type_string,
     ):
         left.join(right, [f("a", "b"), left.a1 == right.b1],
                   join_type).collect()
Example #10
 def runWithJoinType(join_type, type_string):
     with self.assertRaisesRegex(
             AnalysisException,
             """Using PythonUDF in join condition of join type "%s" is not supported"""
             % type_string,
     ):
         left.join(right, [f("a", "b"), left.a1 == right.b1],
                   join_type).collect()
Example #11
 def test_udf_and_common_filter_in_join_condition(self):
     # regression test for SPARK-25314
     # test the complex scenario with both udf and common filter
     left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
     right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
     f = udf(lambda a, b: a == b, BooleanType())
     df = left.join(right, [f("a", "b"), left.a1 == right.b1])
     # spark.sql.crossJoin.enabled=true is not needed because the udf is not the only join condition.
     self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)])
Example #13
 def test_udf_in_join_condition(self):
     # regression test for SPARK-25314
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(b=1)])
     f = udf(lambda a, b: a == b, BooleanType())
     df = left.join(right, f("a", "b"))
     with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'):
         df.collect()
     with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
         self.assertEqual(df.collect(), [Row(a=1, b=1)])
Example #15
 def test_udf_in_left_outer_join_condition(self):
     # regression test for SPARK-26147
     from pyspark.sql.functions import col
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(b=1)])
     f = udf(lambda a: str(a), StringType())
     # The join condition can't be pushed down, as it refers to attributes from both sides.
     # The Python UDF only refers to attributes from one side, so it's evaluable.
     df = left.join(right, f("a") == col("b").cast("string"), how="left_outer")
     with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
         self.assertEqual(df.collect(), [Row(a=1, b=1)])
Example #17
def detect_barometric_anamoly(barometric_reading, TimeStamp):
    '''
        Driver function to detect barometric anomalies.
    '''
    barometric_reading = np.asarray(barometric_reading)
    TimeStamp = np.asarray(TimeStamp)
    try:
        if np.amin(barometric_reading) > 370:
            return False
        elif np.amin(barometric_reading) < 0:
            return False
        else:
            # Sort readings chronologically before windowing.
            sorted_TimeStamp = TimeStamp.argsort()
            barometric_reading = barometric_reading[sorted_TimeStamp]
            TimeStamp = TimeStamp[sorted_TimeStamp]

            # Time at which the barometric reading (and hence the drone height) is lowest
            Minimum_time = TimeStamp[np.where(
                barometric_reading == np.amin(barometric_reading))]
            Mid_time = TimeStamp[int(TimeStamp.size / 2)]

            # Window size (in seconds) around the minimum-pressure time,
            # i.e. the data in [Minimum_time - Window_Size_Secs, Minimum_time + Window_Size_Secs] is kept
            Window_Size_Secs = 10
            sliced_barometric = barometric_reading[np.where((TimeStamp > (Minimum_time - Window_Size_Secs)) & \
                                                             (TimeStamp < (Minimum_time + Window_Size_Secs)))]
            sliced_TimeStamp = TimeStamp[np.where((TimeStamp > (Minimum_time - Window_Size_Secs)) & \
                                                (TimeStamp < (Minimum_time + Window_Size_Secs)))]

            # generate expected malfunctioning device data
            length_array = TimeStamp[np.where((TimeStamp > (Mid_time - Window_Size_Secs)) & \
                                                (TimeStamp < (Mid_time + Window_Size_Secs)))].size
            anomalous_event, ts = get_anomalous_event(length_array)
            sliced_anomalous = anomalous_event[np.where((ts > (np.amin(sliced_TimeStamp) - Minimum_time)[0]) & \
                                                         (ts <= (np.amax(sliced_TimeStamp) - Minimum_time)[0]))]
            sliced_ts = ts[np.where((ts > (np.amin(sliced_TimeStamp) - Minimum_time)[0]) & \
                                                         (ts <= (np.amax(sliced_TimeStamp) - Minimum_time)[0]))]
            f = interpolate.interp1d(np.linspace(0, 1, len(sliced_anomalous)),
                                     sliced_anomalous)

            x = np.linspace(0, 1, sliced_barometric.size)
            compare_anomalous = f(x)

            Error = RMSE((sliced_barometric), compare_anomalous)

            if Error < 11:
                return True
            else:
                return False

    except ValueError:
        return False
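The driver above relies on numpy, scipy.interpolate, and two project-specific helpers, RMSE and get_anomalous_event, none of which are shown in this excerpt. get_anomalous_event's contract cannot be recovered from the snippet, but a minimal sketch of RMSE, assuming the conventional root-mean-square-error definition, would be:

import numpy as np

def RMSE(observed, expected):
    # Root-mean-square error between two equal-length arrays.
    observed = np.asarray(observed, dtype=float)
    expected = np.asarray(expected, dtype=float)
    return np.sqrt(np.mean((observed - expected) ** 2))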
Example #18
 def test_udf_in_join_condition(self):
     # regression test for SPARK-25314
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(b=1)])
     f = udf(lambda a, b: a == b, BooleanType())
     # The udf uses attributes from both sides of the join, so it is pulled out as Filter +
     # Cross join.
     df = left.join(right, f("a", "b"))
     with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
         with self.assertRaisesRegex(AnalysisException, 'Detected implicit cartesian product'):
             df.collect()
     with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
         self.assertEqual(df.collect(), [Row(a=1, b=1)])
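self.sql_conf({...}), used in this and several earlier examples, is another PySpark test helper that temporarily overrides SQL configuration values and restores the previous values on exit. A simplified sketch of that behaviour (the save/restore details here are an assumption, not the library's exact implementation) could look like:

from contextlib import contextmanager

@contextmanager
def sql_conf(spark, pairs):
    # Apply the given SQL confs, run the block, then restore the old values.
    keys = list(pairs.keys())
    old_values = [spark.conf.get(key, None) for key in keys]
    try:
        for key, value in pairs.items():
            spark.conf.set(key, str(value))
        yield
    finally:
        for key, old in zip(keys, old_values):
            if old is None:
                spark.conf.unset(key)
            else:
                spark.conf.set(key, old)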
Example #19
    def test_udf_defers_judf_initialization(self):
        # This is kept separate from UDFInitializationTests
        # to avoid context initialization
        # when the udf is called.
        f = UserDefinedFunction(lambda x: x, StringType())

        self.assertIsNone(
            f._judf_placeholder, "judf should not be initialized before the first call."
        )

        self.assertIsInstance(f("foo"), Column, "UDF call should return a Column.")

        self.assertIsNotNone(
            f._judf_placeholder, "judf should be initialized after UDF has been called."
        )
Example #21
 def runWithJoinType(join_type, type_string):
     with self.assertRaisesRegexp(
             AnalysisException,
             'Using PythonUDF.*%s is not supported.' % type_string):
         left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect()
Example #22
    def test_udf_globals_not_overwritten(self):
        @udf('string')
        def f():
            assert "itertools" not in str(map)

        self.spark.range(1).select(f()).collect()
Example #23
def transform(self, f):
    return f(self)
Example #24
parquet_basic_gen = [
    byte_gen,
    short_gen,
    int_gen,
    long_gen,
    float_gen,
    double_gen,
    string_gen,
    boolean_gen,
    date_gen,
    # we are limiting TimestampGen to avoid overflowing the INT96 value
    # see https://github.com/rapidsai/cudf/issues/8070
    limited_timestamp()
]

parquet_basic_map_gens = [
    MapGen(f(nullable=False), f()) for f in [
        BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen,
        DoubleGen, DateGen, limited_timestamp
    ]
] + [
    simple_string_to_string_map_gen,
    MapGen(DecimalGen(20, 2, nullable=False), decimal_gen_128bit)
]

parquet_struct_gen_no_maps = [
    StructGen([['child' + str(ind), sub_gen]
               for ind, sub_gen in enumerate(parquet_basic_gen)]),
    StructGen([['child0', StructGen([['child1', byte_gen]])]])
]

parquet_struct_of_map_gen = StructGen(
Example #25
array_gens_sample = single_level_array_gens + nested_array_gens_sample

# all of the basic types in a single struct
all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)])

# Some struct gens, but not all because of nesting
nonempty_struct_gens_sample = [all_basic_struct_gen,
        StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]),
        StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])]

struct_gens_sample = nonempty_struct_gens_sample + [StructGen([])]

simple_string_to_string_map_gen = MapGen(StringGen(pattern='key_[0-9]', nullable=False),
        StringGen(), max_length=10)

all_basic_map_gens = [MapGen(f(nullable=False), f()) for f in [BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, DateGen, TimestampGen]] + [simple_string_to_string_map_gen]

# Some map gens, but not all because of nesting
map_gens_sample = all_basic_map_gens + [MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10),
        MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10),
        MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)]

allow_negative_scale_of_decimal_conf = {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}

no_nans_conf = {'spark.rapids.sql.hasNans': 'false'}

def copy_and_update(conf, *more_confs):
    local_conf = conf.copy()
    for more in more_confs:
        local_conf.update(more)
    return local_conf
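For instance, copy_and_update can merge the two conf dictionaries defined above without mutating either of them:

combined_conf = copy_and_update(allow_negative_scale_of_decimal_conf, no_nans_conf)
# {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true', 'spark.rapids.sql.hasNans': 'false'}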