Example #1
    def test_ols_equivalence(self):
        # Simple OLS problem: y = a * x + b + r, where r ~ N(0, 1)
        n = 40
        a = 0.27
        b = 1.2
        x = np.arange(0, n)
        r = np.random.normal(0, 1, n)
        y = (a * x + b + r).reshape(n, 1)
        features = x.reshape(n, 1)
        features = np.concatenate([features, np.ones_like(features)], axis=1)
        df = self.spark.createDataFrame(
            [(Vectors.dense(y[i]), Matrices.dense(1, 2, features[i]))
             for i in range(n)], ["measurement", "measurementModel"])
        lkf = LinearKalmanFilter()\
            .setInitialStateMean(Vectors.dense(0.0, 0.0))\
            .setMeasurementModelCol("measurementModel")\
            .setMeasurementCol("measurement")\
            .setInitialStateCovariance(Matrices.dense(2, 2, (np.eye(2) * 10).reshape(4)))\
            .setProcessModel(Matrices.dense(2, 2, np.eye(2).reshape(4)))\
            .setProcessNoise(Matrices.dense(2, 2, np.zeros(4)))\
            .setMeasurementNoise(Matrices.dense(1, 1, [10E-5]))

        model = lkf.transform(df)
        state = model.filter(
            "stateIndex = {}".format(n)).collect()[0].state.mean.values

        # Check equivalence with least squares solution with numpy
        expected, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
        np.testing.assert_array_almost_equal(state,
                                             expected.reshape(2),
                                             decimal=5)
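With an identity process model and zero process noise, the filter configured above degenerates to recursive least squares, which is why it must converge to the lstsq solution. A minimal NumPy sketch of the same recursion outside Spark (parameter names and the two-state setup mirror the test; this is an illustration, not the library's code):

import numpy as np

def kalman_rls(measurements, models, r=1e-4, p0=10.0):
    # Identity process model + zero process noise: each step reduces to
    # a recursive least squares update on one measurement model row.
    mean = np.zeros((2, 1))
    cov = np.eye(2) * p0
    for z, h in zip(measurements, models):
        h = h.reshape(1, 2)
        s = h @ cov @ h.T + r             # innovation variance
        k = cov @ h.T / s                 # Kalman gain
        mean = mean + k * (z - h @ mean)  # state update
        cov = cov - k @ h @ cov           # covariance update
    return mean.ravel()

n = 40
x = np.arange(n)
y = 0.27 * x + 1.2 + np.random.normal(0, 1, n)
features = np.stack([x, np.ones(n)], axis=1)
expected, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
np.testing.assert_array_almost_equal(kalman_rls(y, features), expected, decimal=5)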
Example #2
    def test_multiple_model_adaptive_filter(self):
        # Same OLS problem measured under two state keys; the MMAE
        # blend of the per-key filters should recover the solution.
        n = 100
        a = 0.27
        b = 1.2
        x = np.concatenate([np.arange(0, n), np.arange(0, n)])
        r = np.random.normal(0, 1, n * 2)
        y = (a * x + b + r).reshape(n * 2, 1)
        features = x.reshape(n * 2, 1)
        features = np.concatenate([features, np.ones_like(features)], axis=1)
        state_keys = ["1"] * n + ["2"] * n
        df = self.spark.createDataFrame(
            [(state_keys[i], Vectors.dense(y[i]),
              Matrices.dense(1, 2, features[i]))
             for i in range(n * 2)],
            ["state_key", "measurement", "measurementModel"])

        mmaeFilter = LinearKalmanFilter(2, 1)\
            .setStateKeyCol("state_key")\
            .setMeasurementModelCol("measurementModel")\
            .setMeasurementCol("measurement")\
            .setInitialCovariance(Matrices.dense(2, 2, (np.eye(2) * 10).reshape(4)))\
            .setProcessModel(Matrices.dense(2, 2, np.eye(2).reshape(4)))\
            .setProcessNoise(Matrices.dense(2, 2, np.zeros(4)))\
            .setMeasurementNoise(Matrices.dense(1, 1, [1.0]))\
            .setSlidingLikelihoodWindow(5)\
            .setEnableMultipleModelAdaptiveEstimation()

        model = mmaeFilter.transform(df)
        state = model.filter(
            "stateIndex = {}".format(n)).collect()[0].state.values

        expected, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
        np.testing.assert_array_almost_equal(state,
                                             expected.reshape(2),
                                             decimal=0)
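The MMAE estimate is, conceptually, a likelihood-weighted blend of the candidate filters' states, with likelihoods accumulated over the sliding window configured above. A rough NumPy sketch of that weighting step (the blending scheme shown is an illustrative assumption, not the library's exact implementation):

import numpy as np

def mmae_blend(states, innovations, innovation_vars, window=5):
    # Score each candidate filter by its Gaussian log-likelihood over
    # the last `window` innovations, then blend the states with the
    # normalized weights.
    logliks = []
    for res, var in zip(innovations, innovation_vars):
        r = np.asarray(res[-window:], dtype=float)
        v = np.asarray(var[-window:], dtype=float)
        logliks.append(-0.5 * np.sum(np.log(2 * np.pi * v) + r ** 2 / v))
    logliks = np.array(logliks)
    w = np.exp(logliks - logliks.max())
    w /= w.sum()
    return sum(wi * si for wi, si in zip(w, states))

# Two candidates: the second one's innovations are far too large, so the
# blend lands near the first state.
states = [np.array([0.25, 1.3]), np.array([0.8, -2.0])]
innovations = [[0.1, -0.2, 0.05], [3.1, -2.7, 4.0]]
variances = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
print(mmae_blend(states, innovations, variances))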
Example #3
    def test_matrix_multiply(self):
        mat1 = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
        mat2 = Matrices.dense(2, 2, [4.0, 5.0, 6.0, 7.0])
        result = self.spark.createDataFrame([(mat1, mat2)], ["mat1", "mat2"])\
            .withColumn("result", multiplyMatrix("mat1", "mat2"))\
            .select("result").head().result
        np.testing.assert_array_almost_equal(
            result.toArray(), np.dot(mat1.toArray(), mat2.toArray()))
Example #4
    def test_matrix_projection(self):
        mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
        proj = Matrices.dense(1, 2, [4.0, 5.0])
        result = self.spark.createDataFrame([(mat, proj)], ["mat", "proj"]) \
            .withColumn("result", projectMatrix("mat", "proj")) \
            .select("result").head().result
        expected = np.dot(np.dot(proj.toArray(), mat.toArray()),
                          proj.toArray().transpose())
        np.testing.assert_array_almost_equal(result.toArray(), expected)
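projectMatrix evaluates the quadratic form proj · mat · projᵀ. In the Kalman filter examples this is the shape of operation used to push a state covariance through a measurement model to get a predicted measurement variance; a plain NumPy illustration (values are made up):

import numpy as np

P = np.array([[2.0, 0.5],
              [0.5, 1.0]])    # state covariance
H = np.array([[1.0, 0.0]])    # 1x2 measurement model
print(H @ P @ H.T)            # [[2.]] -- variance of the measurement H @ x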
Example #5
    def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(
            regParam=0.01,
            lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
            upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0),
        )
        model = lor.fit(df)
        expected = [
            [4.593, 4.5516, 9.0099, 12.2904],
            [1.0, 8.1093, 7.0, 10.0],
            [3.041, 5.0, 8.0, 11.0],
        ]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i],
                            expected[i],
                            atol=1e-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(),
                        [-0.9057, -1.1392, -0.0033],
                        atol=1e-4))
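One thing worth flagging: Matrices.dense fills its values column-major, so lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)) does not lay range(12) out row by row. A quick check of the layout:

import numpy as np
from pyspark.ml.linalg import Matrices

bounds = Matrices.dense(3, 4, range(12))
# Column-major fill: entry (i, j) holds i + 3 * j
np.testing.assert_array_equal(bounds.toArray(),
                              np.arange(12).reshape(4, 3).T)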
Example #6
    def test_matrix_array(self):
        mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
        arr = self.spark.createDataFrame([(mat,)], ["mat"]) \
            .withColumn("arr", matrixToArray("mat")) \
            .select("arr").head().arr
        # DenseMatrix stores its values column-major, hence order="F"
        np.testing.assert_array_almost_equal(
            np.array(arr.values),
            mat.toArray().reshape(4, order="F"))
Example #7
    def test_matrix_vector_multiply(self):
        mat = Matrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
        vec = Vectors.dense(1.0, 2.0)
        result = self.spark.createDataFrame([(mat, vec)], ["mat", "vec"]) \
            .withColumn("result", multiplyMatrixVector("mat", "vec")) \
            .select("result").head().result
        np.testing.assert_array_almost_equal(
            result.toArray(), np.dot(mat.toArray(), vec.toArray()))
Example #8
import numpy as np
from pyspark.ml.linalg import Matrices, Vectors

def do_spark():
    # sparky = SparkSession.builder.getOrCreate()
    # df = sparky.sql('''select 'spark' as hello ''')

    v = Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)

    # dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
    # dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12])

    # mat = sc.io.mmread('494_bus.mtx')

    # Something bigger with matrices
    n = 50
    s1 = np.random.normal(0, 1, n * n)
    s2 = np.random.normal(0, 1, n * n)
    dm3 = Matrices.dense(n, n, s1)
    dm4 = Matrices.dense(n, n, s2)

    # Sparse copies of the dense matrices (a size experiment; dense
    # Gaussian data gains nothing from the sparse format)
    dm3s = dm3.toSparse()
    dm4s = dm4.toSparse()

    return "{}".format(v.norm(2))
Example #9
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example #10
    def test_persistence(self):
        filter = LinearKalmanFilter() \
            .setStateSize(2) \
            .setInitialStateMean(Vectors.dense([0.0, 0.0])) \
            .setInitialStateCovariance(Matrices.dense(2, 2, [1.0, 0.0, 0.0, 0.0]))

        path = tempfile.mkdtemp()
        model_path = os.path.join(path, "lkf")
        filter.save(model_path)

        loaded = LinearKalmanFilter.load(model_path)
        assert (loaded.getInitialStateMean() == filter.getInitialStateMean())
        assert (loaded.getInitialStateCovariance() ==
                filter.getInitialStateCovariance())
        assert (loaded.getStateSize() == filter.getStateSize())
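One nit with the test as written: tempfile.mkdtemp leaves the directory behind. A sketch of the same save/load flow with cleanup, reusing the filter from the test above:

import os
import shutil
import tempfile

path = tempfile.mkdtemp()
try:
    model_path = os.path.join(path, "lkf")
    filter.save(model_path)
    loaded = LinearKalmanFilter.load(model_path)
finally:
    shutil.rmtree(path, ignore_errors=True)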
Example #11
    def test_batch_save_and_resume(self):
        n = 100
        ts = np.arange(0, n)
        zs = np.random.normal(0, 1, n) + ts

        split_point = n // 2
        initial = zs[:split_point]
        remaining = zs[split_point:]

        filter = LinearKalmanFilter()\
            .setMeasurementCol("measurement")\
            .setInitialStateMean(
                Vectors.dense([0.0, 0.0]))\
            .setInitialStateCovariance(
                Matrices.dense(2, 2, [1, 0, 0, 1]))\
            .setProcessModel(
                Matrices.dense(2, 2, [1, 0, 1, 1]))\
            .setProcessNoise(
                Matrices.dense(2, 2, [0.01, 0.0, 0.0, 0.01]))\
            .setMeasurementNoise(
                Matrices.dense(1, 1, [1]))\
            .setMeasurementModel(
                Matrices.dense(1, 2, [1, 0]))

        # Note: the setters mutate and return the same instance, so
        # `filter` itself now also carries the inflated covariance.
        initial_filter = filter.setInitialStateCovariance(
            Matrices.dense(2, 2, [1000.0, 0.0, 0.0, 1000.0]))

        def create_df(m):
            return self.spark.createDataFrame([(Vectors.dense(m[i]), )
                                               for i in range(len(m))],
                                              ["measurement"])

        initial_measurements = create_df(initial)

        complete_measurements = create_df(zs)

        initial_state = initial_filter.transform(initial_measurements)\
            .filter("stateIndex == {}".format(len(initial)))\
            .select("stateKey", "state")

        complete_state = initial_filter.transform(complete_measurements) \
            .filter("stateIndex == {}".format(len(zs)))\
            .select("stateKey", "state")

        restarted_filter = filter\
            .setInitialStateDistributionCol("state")

        remaining_measurements = create_df(remaining)\
            .crossJoin(initial_state)

        restarted_state = restarted_filter.transform(remaining_measurements)\
            .filter("stateIndex == {}".format(n - split_point))\
            .select("stateKey", "state")

        assert (restarted_state.collect() == complete_state.collect())
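The crossJoin works because this test runs a single implicit state key; with multiple keyed states the saved snapshot would instead be joined back on its key (a sketch, assuming the measurement DataFrame carried a stateKey column):

remaining_measurements = create_df(remaining) \
    .join(initial_state, on="stateKey", how="left")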
Example #12
    num_states = int(sys.argv[1])
    mps = int(sys.argv[2])

    spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    noise_param = 1

    input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("trend", (F.col("value")/num_states).cast("Integer") + F.randn() * noise_param)

    lkf = LinearKalmanFilter(2, 1)\
        .setStateKeyCol("stateKey")\
        .setMeasurementCol("measurement")\
        .setInitialCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\
        .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\
        .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\
        .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\
        .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0]))

    assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement")

    measurements = assembler.transform(input_df)
    query = lkf.transform(measurements)\
        .writeStream\
        .queryName("RateSourceLKF")\
        .outputMode("append")\
        .format("console")\
        .start()
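The snippet starts the query and falls through; a streaming driver would normally block until termination:

    query.awaitTermination()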
Example #13
    num_states = int(sys.argv[1])
    mps = int(sys.argv[2])

    spark = SparkSession.builder.appName("LKFRateSourceOLS").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # OLS problem; the states to be estimated are a, b and c
    # z = a * x + b * y + c + w, where w ~ N(0, 1)
    a = 0.5
    b = 0.2
    c = 1.2
    noise_param = 1

    label_udf = F.udf(lambda x, y, w: Vectors.dense([x * a + y * b + c + w]),
                      VectorUDT())
    features_udf = F.udf(lambda x, y: Matrices.dense(1, 3, [x, y, 1]),
                         MatrixUDT())

    features = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("x", (F.col("value")/num_states).cast("Integer").cast("Double"))\
        .withColumn("y", F.sqrt("x"))\
        .withColumn("w", F.randn(0) * noise_param)\
        .withColumn("label", label_udf("x", "y", "w"))\
        .withColumn("features", features_udf("x", "y"))

    lkf = LinearKalmanFilter()\
        .setStateKeyCol("stateKey")\
        .setMeasurementCol("label")\
        .setMeasurementModelCol("features") \