Example #1
0
class MatrixUDTTests(MLlibTestCase):
    """Tests for MatrixUDT: JSON round-trip, (de)serialization, and schema
    inference for both dense and sparse matrices.
    """

    # Fixtures: dense/sparse matrices in both normal and transposed layouts.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        """The UDT survives a round trip through its JSON representation."""
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        """deserialize(serialize(m)) is the identity for every fixture."""
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        """Matrix columns are inferred as MatrixUDT and round-trip intact."""
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # Fix: the original used assertTrue(x, y), which treats y as a failure
        # *message* and passes for any truthy x — it never compared the values.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                # Same fix as above: actually compare instead of assertTrue.
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
Example #2
0
 def test_json_schema(self):
     """Round-trip the UDT through its JSON form and check equality."""
     round_tripped = MatrixUDT.fromJson(self.udt.jsonValue())
     self.assertEqual(round_tripped, self.udt)
Example #3
0
 def test_json_schema(self):
     """Serializing the UDT to JSON and parsing it back yields an equal UDT."""
     as_json = self.udt.jsonValue()
     self.assertEqual(MatrixUDT.fromJson(as_json), self.udt)
Example #4
0
        F.col("District").alias("jDistrict"),
        F.col("Year").alias("jYear"),
        F.col("hc").alias("hc"),
        F.col("dow").alias("dow"),
        F.col("doy").alias("doy"),
        F.col("hr").alias("hr"))

# Left-join the aggregated frame back onto the base frame on (y, District,
# Year), then discard the duplicated join-key columns from the right side.
# Parenthesized call arguments make the backslash continuations unnecessary.
join_conditions = [
    join_df.jy == df.y,
    join_df.jDistrict == df.District,
    join_df.jYear == df.Year,
]
tmp = (
    tmp.join(join_df, join_conditions, how='left')
       .drop("jYear", "jDistrict", "jy")
)


@udf(MatrixUDT())
def yearly_dayofweek_hour_matrix(doy_ar, dow_ar, hr_ar, hc_ar, dow, hr, year):
    """
    Input is all crimes of a certain type for a year, within a district
    E.g. All Narcotics crimes of 2018 in District 09
    
    params:
    doy_ar: Day of the year array
    dow_ar: Day of the week array
    hr_ar: Hour array
    hc_ar: Hour count array
    arrays should be of the same length. By stacking them on top of each other in a matrix, we get
    the day of the year, day the week, hour and amount of a certain crime by filtering on an index (column)
    
    doy: Day of the year for the incoming row
    dow: Day of the week value for the incoming row
Example #5
0
    # Messages (rows) per second for the streaming "rate" source, taken from
    # the second CLI argument.
    mps = int(sys.argv[2])

    spark = SparkSession.builder.appName("LKFRateSourceOLS").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # OLS problem, states to be estimated are a, b and c
    # z = a*x + b * y + c + w, where w ~ N(0, 1)
    a = 0.5
    b = 0.2
    c = 1.2
    noise_param = 1  # scale applied to the standard-normal noise column w

    # Label UDF: noisy linear measurement z = a*x + b*y + c + w, wrapped in a
    # length-1 dense vector (VectorUDT) as expected by the measurement column.
    label_udf = F.udf(lambda x, y, w: Vectors.dense([x * a + y * b + c + w]),
                      VectorUDT())
    # Measurement-model UDF: 1x3 dense matrix [x, y, 1] that maps the state
    # (a, b, c) to the scalar measurement z.
    features_udf = F.udf(lambda x, y: Matrices.dense(1, 3, [x, y, 1]),
                         MatrixUDT())

    # Synthetic stream: partition rows across `num_states` keys, derive x as a
    # per-key counter, y = sqrt(x), w ~ N(0, 1) * noise_param, then attach the
    # label and measurement-model columns.
    # NOTE(review): F.randn(0) uses a fixed seed — presumably for
    # reproducibility; confirm this is intended in a streaming job.
    features = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("x", (F.col("value")/num_states).cast("Integer").cast("Double"))\
        .withColumn("y", F.sqrt("x"))\
        .withColumn("w", F.randn(0) * noise_param)\
        .withColumn("label", label_udf("x", "y", "w"))\
        .withColumn("features", features_udf("x", "y"))

    lkf = LinearKalmanFilter()\
        .setStateKeyCol("stateKey")\
        .setMeasurementCol("label")\
        .setMeasurementModelCol("features") \
        .setInitialStateMean(Vectors.dense([0.0, 0.0, 0.0]))\