from pyspark.mllib.linalg import DenseMatrix, SparseMatrix, MatrixUDT
from pyspark.testing.mllibutils import MLlibTestCase  # test base class providing self.sc


class MatrixUDTTests(MLlibTestCase):

    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual, not assertTrue: assertTrue's second argument is a
        # failure message, so the original assertions could never fail.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
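The test above relies on schema inference. For reference, MatrixUDT can also be declared explicitly in a schema; a minimal sketch (the column names and the example matrix are illustrative, not from the test suite):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.mllib.linalg import DenseMatrix, MatrixUDT

spark = SparkSession.builder.getOrCreate()

# Declare the matrix column explicitly instead of inferring it.
schema = StructType([
    StructField("name", StringType(), False),
    StructField("mat", MatrixUDT(), False),
])
df = spark.createDataFrame(
    [("dense", DenseMatrix(2, 2, [1.0, 2.0, 3.0, 4.0]))], schema)
df.printSchema()  # the "mat" field is reported with the matrix UDT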
F.col("District").alias("jDistrict"), F.col("Year").alias("jYear"), F.col("hc").alias("hc"), F.col("dow").alias("dow"), F.col("doy").alias("doy"), F.col("hr").alias("hr")) tmp = tmp.join(join_df,\ ([join_df.jy == df.y,\ join_df.jDistrict == df.District, join_df.jYear == df.Year]),\ how='left')\ .drop("jYear","jDistrict","jy") @udf(MatrixUDT()) def yearly_dayofweek_hour_matrix(doy_ar, dow_ar, hr_ar, hc_ar, dow, hr, year): """ Input is all crimes of a certain type for a year, within a district E.g. All Narcotics crimes of 2018 in District 09 params: doy_ar: Day of the year array dow_ar: Day of the week array hr_ar: Hour array hc_ar: Hour count array arrays should be of the same length. By stacking them on top of each other in a matrix, we get the day of the year, day the week, hour and amount of a certain crime by filtering on an index (column) doy: Day of the year for the incoming row dow: Day of the week value for the incoming row
import sys

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.linalg import Matrices, MatrixUDT, Vectors, VectorUDT
# LinearKalmanFilter is provided by the artan library (Kalman filters for
# Spark Structured Streaming); this import path is the assumed one.
from artan.filter import LinearKalmanFilter

num_states = int(sys.argv[1])  # restored: the snippet used num_states without defining it
mps = int(sys.argv[2])         # measurements (rows) generated per second

spark = SparkSession.builder.appName("LKFRateSourceOLS").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# OLS problem, states to be estimated are a, b and c
# z = a*x + b*y + c + w, where w ~ N(0, 1)
a = 0.5
b = 0.2
c = 1.2
noise_param = 1

label_udf = F.udf(lambda x, y, w: Vectors.dense([x * a + y * b + c + w]), VectorUDT())
features_udf = F.udf(lambda x, y: Matrices.dense(1, 3, [x, y, 1]), MatrixUDT())

features = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("x", (F.col("value") / num_states).cast("Integer").cast("Double"))\
    .withColumn("y", F.sqrt("x"))\
    .withColumn("w", F.randn(0) * noise_param)\
    .withColumn("label", label_udf("x", "y", "w"))\
    .withColumn("features", features_udf("x", "y"))

lkf = LinearKalmanFilter()\
    .setStateKeyCol("stateKey")\
    .setMeasurementCol("label")\
    .setMeasurementModelCol("features")\
    .setInitialStateMean(Vectors.dense([0.0, 0.0, 0.0]))
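The snippet ends before the filter is fully configured and the stream is started. A plausible continuation, sketched from the usual Structured Streaming pattern (the setInitialStateCovariance call and the covariance values are assumptions, not verbatim from the original example):

# Assumed continuation: a diagonal initial covariance and a console sink.
lkf = lkf.setInitialStateCovariance(
    Matrices.dense(3, 3, [10.0, 0.0, 0.0,
                          0.0, 10.0, 0.0,
                          0.0, 0.0, 10.0]))

query = lkf.transform(features)\
    .writeStream\
    .queryName("LKFRateSourceOLS")\
    .outputMode("append")\
    .format("console")\
    .start()
query.awaitTermination()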