def test_ols_equivalence(self):
    # Simple OLS problem: y = a * x + b + r, where r ~ N(0, 1)
    n = 40
    a = 0.27
    b = 1.2
    x = np.arange(0, n)
    r = np.random.normal(0, 1, n)
    y = (a * x + b + r).reshape(n, 1)
    features = x.reshape(n, 1)
    features = np.concatenate([features, np.ones_like(features)], axis=1)

    df = self.spark.createDataFrame(
        [(Vectors.dense(y[i]), Matrices.dense(1, 2, features[i])) for i in range(n)],
        ["measurement", "measurementModel"])

    lkf = LinearKalmanFilter()\
        .setInitialStateMean(Vectors.dense(0.0, 0.0))\
        .setMeasurementModelCol("measurementModel")\
        .setMeasurementCol("measurement")\
        .setInitialStateCovariance(Matrices.dense(2, 2, (np.eye(2) * 10).reshape(4, 1)))\
        .setProcessModel(Matrices.dense(2, 2, np.eye(2).reshape(4, 1)))\
        .setProcessNoise(Matrices.dense(2, 2, np.zeros(4)))\
        .setMeasurementNoise(Matrices.dense(1, 1, [10E-5]))

    model = lkf.transform(df)
    state = model.filter(
        "stateIndex = {}".format(n)).collect()[0].state.mean.values

    # Check equivalence with the least squares solution from numpy
    expected, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
    np.testing.assert_array_almost_equal(state, expected.reshape(2), decimal=5)
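# A minimal numpy sketch (not part of the test suite; names are illustrative)
# of why the assertion above holds: with an identity process model and zero
# process noise, each Kalman update is exactly a recursive least squares step,
# so after all n measurements the state mean converges to the batch OLS fit.
def _rls_sketch(features, y, prior_var=10.0, meas_var=10E-5):
    import numpy as np
    theta = np.zeros(2)               # state mean, same init as the filter
    P = np.eye(2) * prior_var         # state covariance
    for H, z in zip(features, y):
        H = H.reshape(1, 2)
        S = H @ P @ H.T + meas_var    # innovation covariance
        K = P @ H.T / S               # Kalman gain
        theta = theta + (K * (z - H @ theta)).ravel()
        P = P - K @ H @ P             # posterior covariance
    return theta  # ~ np.linalg.lstsq(features, y, rcond=None)[0].ravel()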
def test_multiple_model_adaptive_filter(self):
    n = 100
    a = 0.27
    b = 1.2
    x = np.concatenate([np.arange(0, n), np.arange(0, n)])
    r = np.random.normal(0, 1, n * 2)
    y = (a * x + b + r).reshape(n * 2, 1)
    features = x.reshape(n * 2, 1)
    features = np.concatenate([features, np.ones_like(features)], axis=1)
    state_keys = ["1"] * n + ["2"] * n

    df = self.spark.createDataFrame(
        [(state_keys[i], Vectors.dense(y[i]), Matrices.dense(1, 2, features[i]))
         for i in range(n * 2)],
        ["state_key", "measurement", "measurementModel"])

    mmaeFilter = LinearKalmanFilter(2, 1)\
        .setStateKeyCol("state_key")\
        .setMeasurementModelCol("measurementModel")\
        .setMeasurementCol("measurement")\
        .setInitialCovariance(Matrices.dense(2, 2, (np.eye(2) * 10).reshape(4, 1)))\
        .setProcessModel(Matrices.dense(2, 2, np.eye(2).reshape(4, 1)))\
        .setProcessNoise(Matrices.dense(2, 2, np.zeros(4)))\
        .setMeasurementNoise(Matrices.dense(1, 1, [1.0]))\
        .setSlidingLikelihoodWindow(5)\
        .setEnableMultipleModelAdaptiveEstimation()

    model = mmaeFilter.transform(df)
    state = model.filter(
        "stateIndex = {}".format(n)).collect()[0].state.values

    expected, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
    np.testing.assert_array_almost_equal(state, expected.reshape(2), decimal=0)
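# Note on the MMAE test above: with multiple model adaptive estimation
# enabled, the transformer maintains a separate filter per state key and
# blends their estimates, weighting each filter by its measurement likelihood
# over the sliding window (5 samples here). Both keys observe the same
# underlying line, so the blended state should still approximate the OLS
# solution; the looser decimal=0 tolerance reflects the blending.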
def test_matrix_multiply(self):
    mat1 = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
    mat2 = Matrices.dense(2, 2, [4.0, 5.0, 6.0, 7.0])
    result = self.spark.createDataFrame([(mat1, mat2)], ["mat1", "mat2"])\
        .withColumn("result", multiplyMatrix("mat1", "mat2"))\
        .select("result").head().result

    np.testing.assert_array_almost_equal(
        result.toArray(), np.dot(mat1.toArray(), mat2.toArray()))
def test_matrix_projection(self):
    mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
    proj = Matrices.dense(1, 2, [4.0, 5.0])
    result = self.spark.createDataFrame([(mat, proj)], ["mat", "proj"]) \
        .withColumn("result", projectMatrix("mat", "proj")) \
        .select("result").head().result

    # Projection computes proj * mat * proj^T
    expected = np.dot(np.dot(proj.toArray(), mat.toArray()),
                      proj.toArray().transpose())
    np.testing.assert_array_almost_equal(result.toArray(), expected)
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lor = LogisticRegression(
        regParam=0.01,
        lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
        upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0),
    )
    model = lor.fit(df)

    expected = [
        [4.593, 4.5516, 9.0099, 12.2904],
        [1.0, 8.1093, 7.0, 10.0],
        [3.041, 5.0, 8.0, 11.0],
    ]
    for i in range(len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1e-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1e-4))
def test_matrix_array(self):
    mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
    arr = self.spark.createDataFrame([(mat,)], ["mat"]) \
        .withColumn("arr", matrixToArray("mat")) \
        .select("arr").head().arr

    # DenseMatrix stores its values in column-major order, hence order="F"
    np.testing.assert_array_almost_equal(
        np.array(arr.values), mat.toArray().reshape(4, order="F"))
def test_matrix_vector_multiply(self):
    mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
    vec = Vectors.dense(1.0, 2.0)
    result = self.spark.createDataFrame([(mat, vec)], ["mat", "vec"]) \
        .withColumn("result", multiplyMatrixVector("mat", "vec")) \
        .select("result").head().result

    np.testing.assert_array_almost_equal(
        result.toArray(), np.dot(mat.toArray(), vec.toArray()))
def do_spark():
    # Smoke test for local linear algebra: build a dense vector and some
    # larger random dense matrices, exercise dense-to-sparse conversion,
    # and return the L2 norm of the vector.
    v = Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)

    n = 50
    s1 = np.random.normal(0, 1, n * n)
    s2 = np.random.normal(0, 1, n * n)
    dm3 = Matrices.dense(n, n, s1)
    dm4 = Matrices.dense(n, n, s2)
    dm3s = dm3.toSparse()
    dm4s = dm4.toSparse()
    return "{}".format(v.norm(2))
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "features"])

    lor = LogisticRegression(
        regParam=0.01,
        weightCol="weight",
        lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
        upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)

    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
def test_persistence(self):
    filter = LinearKalmanFilter() \
        .setStateSize(2) \
        .setInitialStateMean(Vectors.dense([0.0, 0.0])) \
        .setInitialStateCovariance(Matrices.dense(2, 2, [1.0, 0.0, 0.0, 0.0]))

    path = tempfile.mkdtemp()
    model_path = os.path.join(path, "lkf")
    filter.save(model_path)
    loaded = LinearKalmanFilter.load(model_path)

    assert loaded.getInitialStateMean() == filter.getInitialStateMean()
    assert loaded.getInitialStateCovariance() == filter.getInitialStateCovariance()
    assert loaded.getStateSize() == filter.getStateSize()
def test_batch_save_and_resume(self):
    n = 100
    ts = np.arange(0, n)
    zs = np.random.normal(0, 1, n) + ts

    split_point = n // 2
    initial = zs[:split_point]
    remaining = zs[split_point:]

    filter = LinearKalmanFilter()\
        .setMeasurementCol("measurement")\
        .setInitialStateMean(
            Vectors.dense([0.0, 0.0]))\
        .setInitialStateCovariance(
            Matrices.dense(2, 2, [1, 0, 0, 1]))\
        .setProcessModel(
            Matrices.dense(2, 2, [1, 0, 1, 1]))\
        .setProcessNoise(
            Matrices.dense(2, 2, [0.01, 0.0, 0.0, 0.01]))\
        .setMeasurementNoise(
            Matrices.dense(1, 1, [1]))\
        .setMeasurementModel(
            Matrices.dense(1, 2, [1, 0]))

    initial_filter = filter.setInitialStateCovariance(
        Matrices.dense(2, 2, [1000.0, 0.0, 0.0, 1000.0]))

    def create_df(m):
        return self.spark.createDataFrame(
            [(Vectors.dense(m[i]),) for i in range(len(m))], ["measurement"])

    initial_measurements = create_df(initial)
    complete_measurements = create_df(zs)

    initial_state = initial_filter.transform(initial_measurements)\
        .filter("stateIndex == {}".format(len(initial)))\
        .select("stateKey", "state")

    complete_state = initial_filter.transform(complete_measurements)\
        .filter("stateIndex == {}".format(len(zs)))\
        .select("stateKey", "state")

    restarted_filter = filter\
        .setInitialStateDistributionCol("state")

    remaining_measurements = create_df(remaining)\
        .crossJoin(initial_state)

    restarted_state = restarted_filter.transform(remaining_measurements)\
        .filter("stateIndex == {}".format(n - split_point))\
        .select("stateKey", "state")

    assert restarted_state.collect() == complete_state.collect()
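# Note on the save/resume pattern above: setInitialStateDistributionCol("state")
# tells the restarted filter to read its prior from the "state" column, which
# the crossJoin attaches from the first batch's posterior. Because the Kalman
# filter state is a sufficient statistic for the past measurements, resuming
# from the saved posterior should yield the same final state as filtering the
# whole series in one pass, which is what the final assertion checks.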
# CLI args: number of distinct state keys, then measurements per second
num_states = int(sys.argv[1])
mps = int(sys.argv[2])

spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

noise_param = 1

input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("trend", (F.col("value") / num_states).cast("Integer") + F.randn() * noise_param)

lkf = LinearKalmanFilter(2, 1)\
    .setStateKeyCol("stateKey")\
    .setMeasurementCol("measurement")\
    .setInitialCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\
    .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\
    .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\
    .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\
    .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0]))

assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement")
measurements = assembler.transform(input_df)

query = lkf.transform(measurements)\
    .writeStream\
    .queryName("RateSourceLKF")\
    .outputMode("append")\
    .format("console")\
    .start()

# Keep the process alive while the streaming query runs
query.awaitTermination()
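# A possible invocation for the example above (the script name is
# hypothetical; argv[1] is the number of state keys, argv[2] the rows
# per second of the rate source):
#
#   spark-submit rate_source_lkf.py 10 5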
num_states = int(sys.argv[1])
mps = int(sys.argv[2])

spark = SparkSession.builder.appName("LKFRateSourceOLS").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# OLS problem, states to be estimated are a, b and c
# z = a*x + b*y + c + w, where w ~ N(0, 1)
a = 0.5
b = 0.2
c = 1.2
noise_param = 1

label_udf = F.udf(lambda x, y, w: Vectors.dense([x * a + y * b + c + w]), VectorUDT())
features_udf = F.udf(lambda x, y: Matrices.dense(1, 3, [x, y, 1]), MatrixUDT())

features = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("x", (F.col("value") / num_states).cast("Integer").cast("Double"))\
    .withColumn("y", F.sqrt("x"))\
    .withColumn("w", F.randn(0) * noise_param)\
    .withColumn("label", label_udf("x", "y", "w"))\
    .withColumn("features", features_udf("x", "y"))

lkf = LinearKalmanFilter()\
    .setStateKeyCol("stateKey")\
    .setMeasurementCol("label")\
    .setMeasurementModelCol("features") \