from pyspark.mllib.linalg import DenseMatrix, SparseMatrix, MatrixUDT
from pyspark.testing.mllibutils import MLlibTestCase  # test base class providing self.sc


class MatrixUDTTests(MLlibTestCase):

    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual, not assertTrue: assertTrue's second argument is a
        # failure message, so the original assertions could never fail.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
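The test above relies on schema inference. For reference, MatrixUDT can also be declared explicitly in a schema; a minimal sketch (the column names and the example matrix are illustrative, not from the test suite):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.mllib.linalg import DenseMatrix, MatrixUDT

spark = SparkSession.builder.getOrCreate()

# Declare the matrix column explicitly instead of inferring it.
schema = StructType([
    StructField("name", StringType(), False),
    StructField("mat", MatrixUDT(), False),
])
df = spark.createDataFrame(
    [("dense", DenseMatrix(2, 2, [1.0, 2.0, 3.0, 4.0]))], schema)
df.printSchema()  # the "mat" field is reported with the matrix UDT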
F.col("District").alias("jDistrict"), F.col("Year").alias("jYear"), F.col("hc").alias("hc"), F.col("dow").alias("dow"), F.col("doy").alias("doy"), F.col("hr").alias("hr")) tmp = tmp.join(join_df,\ ([join_df.jy == df.y,\ join_df.jDistrict == df.District, join_df.jYear == df.Year]),\ how='left')\ .drop("jYear","jDistrict","jy") @udf(MatrixUDT()) def yearly_dayofweek_hour_matrix(doy_ar, dow_ar, hr_ar, hc_ar, dow, hr, year): """ Input is all crimes of a certain type for a year, within a district E.g. All Narcotics crimes of 2018 in District 09 params: doy_ar: Day of the year array dow_ar: Day of the week array hr_ar: Hour array hc_ar: Hour count array arrays should be of the same length. By stacking them on top of each other in a matrix, we get the day of the year, day the week, hour and amount of a certain crime by filtering on an index (column) doy: Day of the year for the incoming row dow: Day of the week value for the incoming row
import sys

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.linalg import Matrices, MatrixUDT, Vectors, VectorUDT
# LinearKalmanFilter is provided by the artan library (Kalman filters for
# Spark Structured Streaming); this import path is the assumed one.
from artan.filter import LinearKalmanFilter

num_states = int(sys.argv[1])  # restored: the snippet used num_states without defining it
mps = int(sys.argv[2])         # measurements (rows) generated per second

spark = SparkSession.builder.appName("LKFRateSourceOLS").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# OLS problem, states to be estimated are a, b and c
# z = a*x + b*y + c + w, where w ~ N(0, 1)
a = 0.5
b = 0.2
c = 1.2
noise_param = 1

label_udf = F.udf(lambda x, y, w: Vectors.dense([x * a + y * b + c + w]), VectorUDT())
features_udf = F.udf(lambda x, y: Matrices.dense(1, 3, [x, y, 1]), MatrixUDT())

features = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("x", (F.col("value") / num_states).cast("Integer").cast("Double"))\
    .withColumn("y", F.sqrt("x"))\
    .withColumn("w", F.randn(0) * noise_param)\
    .withColumn("label", label_udf("x", "y", "w"))\
    .withColumn("features", features_udf("x", "y"))

lkf = LinearKalmanFilter()\
    .setStateKeyCol("stateKey")\
    .setMeasurementCol("label")\
    .setMeasurementModelCol("features")\
    .setInitialStateMean(Vectors.dense([0.0, 0.0, 0.0]))
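The snippet ends before the filter is fully configured and the stream is started. A plausible continuation, sketched from the usual Structured Streaming pattern (the setInitialStateCovariance call and the covariance values are assumptions, not verbatim from the original example):

# Assumed continuation: a diagonal initial covariance and a console sink.
lkf = lkf.setInitialStateCovariance(
    Matrices.dense(3, 3, [10.0, 0.0, 0.0,
                          0.0, 10.0, 0.0,
                          0.0, 0.0, 10.0]))

query = lkf.transform(features)\
    .writeStream\
    .queryName("LKFRateSourceOLS")\
    .outputMode("append")\
    .format("console")\
    .start()
query.awaitTermination()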