def test_local_min_max_clamp(self):
    """Compare rf_local_min, rf_local_max and rf_local_clamp with np.clip.

    A random int8 tile is bounded either by another tile column or, for the
    min/max variants, by a scalar; the resulting cells must equal what
    np.clip yields for the matching lower/upper bound.
    """
    shape = (10, 10)
    base = Tile(np.random.randint(-20, 20, shape), CellType.int8())
    lower = Tile(np.random.randint(-20, 0, shape), CellType.int8())
    upper = Tile(np.random.randint(0, 20, shape), CellType.int8())
    frame = self.spark.createDataFrame([Row(t=base, mn=lower, mx=upper)])

    def cells_of(column):
        # Evaluate a single tile-valued column and return its cell array.
        return frame.select(column).first()[0].cells

    # Local min: np.clip with only an upper bound.
    assert_equal(
        cells_of(rf_local_min('t', 'mn')),
        np.clip(base.cells, None, lower.cells),
    )
    assert_equal(
        cells_of(rf_local_min('t', -5)),
        np.clip(base.cells, None, -5),
    )

    # Local max: np.clip with only a lower bound.
    assert_equal(
        cells_of(rf_local_max('t', 'mx')),
        np.clip(base.cells, upper.cells, None),
    )
    assert_equal(
        cells_of(rf_local_max('t', 5)),
        np.clip(base.cells, 5, None),
    )

    # Clamp: both bounds at once.
    assert_equal(
        cells_of(rf_local_clamp('t', 'mn', 'mx')),
        np.clip(base.cells, lower.cells, upper.cells),
    )
def test_cell_type_in_functions(self):
    """Check that functions accept a CellType given as object or by name.

    Converts the 'tile' column using both ``ct`` and ``ct.cell_type_name``,
    builds a constant int8 tile, then marks its constant value as NoData and
    verifies the resulting cell types and data / no-data cell counts.
    """
    from pyrasterframes.rf_types import CellType

    ct = CellType.float32().with_no_data_value(-999)
    cols, rows = 3, 4

    df = (
        self.rf
        .withColumn('ct_str', rf_convert_cell_type('tile', ct.cell_type_name))
        .withColumn('ct', rf_convert_cell_type('tile', ct))
        .withColumn('make', rf_make_constant_tile(99, cols, rows, CellType.int8()))
        .withColumn('make2', rf_with_no_data('make', 99))
    )

    first_row = df.select('ct', 'ct_str', 'make', 'make2').first()

    self.assertEqual(first_row['ct'].cell_type, ct)
    self.assertEqual(first_row['ct_str'].cell_type, ct)
    self.assertEqual(first_row['make'].cell_type, CellType.int8())

    tallies = df.select(
        rf_no_data_cells('make').alias("nodata1"),
        rf_data_cells('make').alias("data1"),
        rf_no_data_cells('make2').alias("nodata2"),
        rf_data_cells('make2').alias("data2"),
    ).first()

    # 'make' has no NoData value defined, so every cell counts as data.
    self.assertEqual(tallies["data1"], cols * rows)
    self.assertEqual(tallies["nodata1"], 0)
    # 'make2' declares 99 (the constant fill value) as NoData.
    self.assertEqual(tallies["data2"], 0)
    self.assertEqual(tallies["nodata2"], cols * rows)

    self.assertEqual(
        first_row['make2'].cell_type,
        CellType.int8().with_no_data_value(99),
    )
def test_rf_where(self):
    """Check that rf_where selects cells the same way np.where does.

    Uses a random 0/1 uint8 condition tile and two random int8 value tiles.
    """
    shape = (10, 10)
    cond = Tile(np.random.binomial(1, 0.35, shape), CellType.uint8())
    x = Tile(np.random.randint(-20, 10, shape), CellType.int8())
    y = Tile(np.random.randint(0, 30, shape), CellType.int8())

    frame = self.spark.createDataFrame([Row(cond=cond, x=x, y=y)])
    selected = frame.select(rf_where('cond', 'x', 'y')).first()
    actual = selected[0].cells

    expected = np.where(cond.cells, x.cells, y.cells)
    assert_equal(actual, expected)
def test_rf_rescale_per_tile(self):
    """Check that rf_rescale without explicit bounds spans exactly [0, 1].

    Two tiles with different value ranges are rescaled and the aggregate
    min and max over the rescaled column are asserted to be 0.0 and 1.0.
    """
    shape = (10, 10)
    first = Tile(np.random.randint(-20, 42, shape), CellType.int8())
    # NOTE(review): the upper bound 242 exceeds the int8 range; confirm how
    # Tile handles such values for an int8 cell type.
    second = Tile(np.random.randint(20, 242, shape), CellType.int8())
    frame = self.spark.createDataFrame([Row(x=first), Row(x=second)])

    stats = (
        frame
        .select(rf_rescale('x').alias('x_prime'))
        .agg(rf_agg_stats('x_prime').alias('stat'))
        .select('stat.min', 'stat.max')
        .first()
    )

    self.assertEqual(stats[0], 0.0)
    self.assertEqual(stats[1], 1.0)
def test_rf_standardize_per_tile(self):
    """Check that rf_standardize yields approximately zero mean, unit variance."""
    # 100 x 100 = 10k samples, so the statistics should be pretty stable.
    source = Tile(np.random.randint(-20, 0, (100, 100)), CellType.int8())
    frame = self.spark.createDataFrame([Row(x=source)])

    standardized = frame.select(rf_standardize('x').alias('z'))
    summary = standardized.select(rf_agg_stats('z').alias('z_stat'))
    moments = summary.select('z_stat.mean', 'z_stat.variance').first()

    self.assertAlmostEqual(moments[0], 0.0)
    self.assertAlmostEqual(moments[1], 1.0)
def test_rf_rescale(self):
    """Check rf_rescale with explicit bounds (-20, 50) on two random tiles.

    Asserts that the largest per-tile minimum of the rescaled column is
    strictly above 0 and the smallest per-tile maximum is strictly below 1.
    """
    from pyspark.sql.functions import max as F_max
    from pyspark.sql.functions import min as F_min

    shape = (10, 10)
    x1 = Tile(np.random.randint(-60, 12, shape), CellType.int8())
    x2 = Tile(np.random.randint(15, 122, shape), CellType.int8())
    frame = self.spark.createDataFrame([Row(x=x1), Row(x=x2)])

    # Note there will be some clipping
    rescaled = frame.select(rf_rescale('x', -20, 50).alias('x_prime'), 'x')

    extremes = rescaled.agg(
        F_max(rf_tile_min('x_prime')),
        F_min(rf_tile_max('x_prime')),
    ).first()

    # Each diagnostic sample is collected right before its own assertion,
    # so the second query only runs if the first assertion passes.
    min_sample = rescaled.select("x", "x_prime", rf_tile_min("x_prime")).take(2)
    self.assertGreater(
        extremes[0],
        0.0,
        f'Expected max tile_min to be > 0 (strictly); but it is {min_sample}',
    )

    max_sample = rescaled.select(rf_tile_max("x_prime")).take(2)
    self.assertLess(
        extremes[1],
        1.0,
        # No separator before the interpolated rows; message text kept as-is.
        f'Expected min tile_max to be < 1 (strictly); it is{max_sample}',
    )
def test_agg_local_mean(self):
    """Check rf_agg_local_mean over two tiles that contain NoData cells.

    The cell type declares 4 as NoData; each input tile holds a 4 in a
    different position, and the expected mean at those positions is the
    value from the other tile.
    """
    from pyspark.sql import Row
    from pyrasterframes.rf_types import Tile

    # this is really testing the nodata propagation in the agg local summation
    ct = CellType.int8().with_no_data_value(4)
    cell_rows = (
        [1, 2, 3, 4, 5, 6],
        [1, 2, 4, 3, 5, 6],
    )
    frame = self.spark.createDataFrame(
        [Row(tile=Tile(np.array([cells]), ct)) for cells in cell_rows]
    )

    aggregated = frame.agg(rf_agg_local_mean('tile').alias('mean')).first()
    actual = aggregated['mean']

    expected = Tile(
        np.array([[1.0, 2.0, 3.0, 3.0, 5.0, 6.0]]),
        CellType.float64(),
    )
    self.assertEqual(actual, expected)