Example 1
    def test_spark_ml_matrix(self):
        df = self.spark.createDataFrame([
            {
                "name": 1,
                "mat": DenseMatrix(2, 2, range(4))
            },
            {
                "name": 2,
                "mat": DenseMatrix(3, 3, range(9))
            },
        ])
        df.write.mode("overwrite").format("rikai").save(self.test_dir)
        df.show()

        records = sorted(self._read_parquets(self.test_dir),
                         key=lambda x: x["name"])

        expected = [
            {
                "name": 1,
                "mat": np.array(range(4), dtype=np.float64).reshape(2, 2).T
            },
            {
                "name": 2,
                "mat": np.array(range(9), dtype=np.float64).reshape(3, 3).T
            },
        ]
        for exp, rec in zip(expected, records):
            self.assertEqual(exp["name"], rec["name"])
            self.assertTrue(np.array_equal(exp["mat"], rec["mat"]))
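Why the transpose in `expected`: a minimal sketch (assuming only numpy and pyspark) showing that DenseMatrix stores its values in column-major order, so a row-major reshape must be transposed to match:

import numpy as np
from pyspark.ml.linalg import DenseMatrix

mat = DenseMatrix(2, 2, range(4))  # values [0, 1, 2, 3], read column by column
col_major = np.array(range(4), dtype=np.float64).reshape(2, 2).T  # same as order="F"
assert np.array_equal(mat.toArray(), col_major)
# mat.toArray() == [[0., 2.],
#                   [1., 3.]]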
Example 2
def testInitialBiasAndWeightsAffectResult(prostateDataset):
    [trainingDataset, testingDataset] = prostateDataset.randomSplit([0.9, 0.1], 1)

    def createInitialDeepLearningDefinition():
        return H2ODeepLearning(seed=42,
                               reproducible=True,
                               labelCol="CAPSULE",
                               featuresCols=["AGE", "RACE", "DPROS", "DCAPS"],
                               hidden=[3])

    referenceDeepLearning = createInitialDeepLearningDefinition()
    referenceModel = referenceDeepLearning.fit(traningDataset)
    referenceResult = referenceModel.transform(testingDataset)

    deepLearning = createInitialDeepLearningDefinition()
    matrix0 = DenseMatrix(3, 4,
                          [.1, .2, .3, .4, .4, .5, .6, .7, .7, .8, .9, .6],
                          False)
    matrix1 = DenseMatrix(1, 3, [.2, .3, .4], False)
    deepLearning.setInitialWeights([matrix0, matrix1])
    deepLearning.setInitialBiases(
        [DenseVector([.1, .2, .3]),
         DenseVector([.1])])
    model = deepLearning.fit(trainingDataset)
    result = model.transform(testingDataset)

    unit_test_utils.assert_data_frames_have_different_values(
        referenceResult, result)
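The weight shapes mirror the topology above: 4 input features, one hidden layer of 3 units, one output. A minimal numpy sanity check of the shape arithmetic (the activation and weight layout here are illustrative assumptions, not H2O's internals):

import numpy as np

features, hidden, output = 4, 3, 1
w0 = np.array([.1, .2, .3, .4, .4, .5, .6, .7, .7, .8, .9, .6]).reshape(hidden, features)
w1 = np.array([.2, .3, .4]).reshape(output, hidden)
b0, b1 = np.array([.1, .2, .3]), np.array([.1])
x = np.ones(features)                 # a dummy input row
h = np.tanh(w0 @ x + b0)              # hidden layer: shape (3,)
y = w1 @ h + b1                       # output layer: shape (1,)
assert h.shape == (hidden,) and y.shape == (output,)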
Example 3
class MatrixUDTTests(MLlibTestCase):

    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
Example 4
def test_spark_ml_matrix(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    df = spark.createDataFrame([
        {
            "name": 1,
            "mat": DenseMatrix(2, 2, range(4))
        },
        {
            "name": 2,
            "mat": DenseMatrix(3, 3, range(9))
        },
    ])
    df.write.mode("overwrite").format("rikai").save(test_dir)
    df.show()

    records = sorted(_read_parquets(test_dir), key=lambda x: x["name"])

    expected = [
        {
            "name": 1,
            "mat": np.array(range(4), dtype=np.float64).reshape(2, 2).T,
        },
        {
            "name": 2,
            "mat": np.array(range(9), dtype=np.float64).reshape(3, 3).T,
        },
    ]
    for exp, rec in zip(expected, records):
        assert exp["name"] == rec["name"]
        assert np.array_equal(exp["mat"], rec["mat"])
Example 5
    def test_repr_dense_matrix(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)")

        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)")

        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            "DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., "
            "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)",
        )
Example 6
    def test_dense_matrix_is_transposed(self):
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
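The toSparse() assertions reflect CSC (compressed sparse column) storage: the entries of column j sit at positions colPtrs[j]:colPtrs[j+1] of rowIndices and values. A minimal sketch:

from pyspark.ml.linalg import DenseMatrix

sm = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9]).toSparse()
# Column 0 -> entries 0:2 -> rows [1, 2], values [1.0, 3.0]
# Column 1 -> entries 2:5 -> rows [0, 1, 2], values [4.0, 6.0, 9.0]
print(list(sm.colPtrs), list(sm.rowIndices), list(sm.values))
# [0, 2, 5] [1, 2, 0, 1, 2] [1.0, 3.0, 4.0, 6.0, 9.0]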
Example 8
def all_time_mat(matrices):
    """
    Input: a list of matrices for a crime type inside a district, for a day of the week during an hour.
    Stack them together in order to get statistics from all available years later on.
    """
    try:
        matrices = list(matrices)
        if len(matrices) < 1:
            return None

        mat_array = [m.toArray() for m in matrices]
        years = [mat_array[0][-1, 0]]
        for yearly_matrix in mat_array[1:]:
            year = yearly_matrix[-1, 0]
            if year in years:
                return None  # There should be exactly one matrix per year
            years.append(year)

        max_len = max(arr.shape[1] for arr in mat_array)
        stacked_padded_matrix = np.array([np.pad(arr, ((0, 0), (0, max_len - arr.shape[1])),
                                                 'constant', constant_values=5000)
                                          for arr in mat_array])
        stacked_padded_matrix = stacked_padded_matrix.reshape(-1, max_len)
        return DenseMatrix(numRows=stacked_padded_matrix.shape[0],
                           numCols=stacked_padded_matrix.shape[1],
                           values=stacked_padded_matrix.flatten(),
                           isTransposed=True)
    except Exception:
        return None
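A quick illustration of the padding step above with made-up data: shorter yearly matrices are right-padded with the sentinel 5000 so all years can be stacked into one rectangular block:

import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6]])   # a year with 3 crime columns
b = np.array([[7, 8], [9, 10]])        # a year with 2 crime columns
max_len = max(arr.shape[1] for arr in (a, b))
padded = np.array([np.pad(arr, ((0, 0), (0, max_len - arr.shape[1])),
                          'constant', constant_values=5000) for arr in (a, b)])
print(padded.reshape(-1, max_len))
# [[   1    2    3]
#  [   4    5    6]
#  [   7    8 5000]
#  [   9   10 5000]]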
Example 9
def test_convert_matrix(spark):
    str_list = ['a', 'b']
    df = spark.createDataFrame(str_list, StringType())
    ndarray = np.array([[1.0, 2.1, 3.2], [4.3, 5.4, 6.5]])
    output_rows = df.withColumn("matrix", lit(ndarray)).collect()
    expected_matrix = DenseMatrix(2, 3, [1.0, 4.3, 2.1, 5.4, 3.2, 6.5])
    assert (output_rows[0].matrix == expected_matrix)
    assert (output_rows[1].matrix == expected_matrix)
Example 10
    def test_matrix_indexing(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))
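Indexing works via column-major addressing: element (i, j) of a non-transposed DenseMatrix lives at flat offset i + j * numRows. A minimal check (assuming pyspark.ml.linalg):

from pyspark.ml.linalg import DenseMatrix

mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
i, j = 2, 1
assert mat[i, j] == mat.values[i + j * mat.numRows] == 10.0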
Example 11
    def test_serialize(self):
        self._test_serialize(DenseVector(range(10)))
        self._test_serialize(DenseVector(array([1.0, 2.0, 3.0, 4.0])))
        self._test_serialize(DenseVector(pyarray.array("d", range(10))))
        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
        self._test_serialize(SparseVector(3, {}))
        self._test_serialize(DenseMatrix(2, 3, range(6)))
        sm1 = SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self._test_serialize(sm1)
Example 12
def save_corr_heatmap(corr: DenseMatrix,
                      columns: List[str],
                      path: str,
                      title="Correlation Matrix"):
    rows = corr.toArray().tolist()
    df = pd.DataFrame(rows)
    fig = plt.figure(figsize=(13, 8))
    sns.heatmap(df, xticklabels=columns, yticklabels=columns, annot=True)
    plt.title(title)
    plt.savefig(path)
    plt.close()
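A typical way to obtain such a correlation DenseMatrix is pyspark.ml.stat.Correlation; a hedged usage sketch (the data and column names here are made up, and save_corr_heatmap is the function above):

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
cols = ["a", "b", "c"]  # hypothetical numeric columns
df = spark.createDataFrame([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 9.0, 8.0)], cols)
assembled = VectorAssembler(inputCols=cols, outputCol="features").transform(df)
corr = Correlation.corr(assembled, "features").head()[0]  # a DenseMatrix
save_corr_heatmap(corr, cols, "/tmp/corr.png")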
Example 13
    def setRegularizationMatrixFactor(self, value):
        """
        Sets the regularization matrix from a float factor, which sets the regularization
        matrix to factor * identity.

        :param value: Float
        :return: RecursiveLeastSquaresFilter
        """
        regMat = np.eye(self._featuresSize) * value
        rows, cols = regMat.shape
        return self._set(regularizationMatrix=DenseMatrix(rows, cols, regMat.reshape(rows * cols, order="F")))
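A standalone sketch of the factor-to-matrix expansion used above (featuresSize is set to 3 here purely for illustration):

import numpy as np
from pyspark.ml.linalg import DenseMatrix

value, featuresSize = 0.5, 3
regMat = np.eye(featuresSize) * value  # factor * identity
rows, cols = regMat.shape
dm = DenseMatrix(rows, cols, regMat.reshape(rows * cols, order="F"))
print(dm.toArray())
# [[0.5 0.  0. ]
#  [0.  0.5 0. ]
#  [0.  0.  0.5]]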
Example 14
def yearly_dayofweek_hour_matrix(doy_ar, dow_ar, hr_ar, hc_ar, dow, hr, year):
    """
    Input is all crimes of a certain type for a year, within a district
    E.g. All Narcotics crimes of 2018 in District 09
    
    params:
    doy_ar: Day of the year array
    dow_ar: Day of the week array
    hr_ar: Hour array
    hc_ar: Hour count array
    The arrays should all be of the same length. By stacking them on top of each other in a matrix,
    we get the day of the year, day of the week, hour and count of a certain crime by filtering on
    an index (column).
    
    dow: Day of the week for the incoming row
    hr: Hour for the incoming row
    year: Year of the incoming row
    These are used to filter the matrix until it only contains crimes occurring on the same day
    of the week, within +-1 hour.
    
    returns:
    A dense, but filtered matrix containing only relevant crimes for this input
    shape: (4, num crimes)
    Example: [4,20,55], (Day of the year)
             [2,4,2],  (Day of the week)
             [22,21,23] (Hour)
             [2,1,1]   (Count)
        
    Filter on crimes at mondays 23:00 -> column 0 and 2
    Filter on crimes at mondays 00:00 -> column 2
    Filter on crimes at wednesdays 20:00 -> column 1 
    """
    dense_mat = np.matrix([doy_ar, dow_ar, hr_ar, hc_ar])  # Dense matrix with all crimes that year
    # Keep only the same day of the week
    dow_filter = dense_mat[:, np.array(dense_mat[1, :] == dow).flatten()]
    # +-1 hour
    hr_filtered = dow_filter[:, np.isin(np.array(dow_filter[2, :]).flatten(),
                                        np.array([(hr - 1) % 24, hr, (hr + 1) % 24]))]
    # The appended last row stores the year we filtered on
    vals = np.append(np.array(hr_filtered),
                     np.full(hr_filtered.shape[1], year).reshape(1, -1),
                     axis=0)
    return DenseMatrix(numRows=5,
                       numCols=vals.shape[1],
                       values=vals.flatten(),
                       isTransposed=True)
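The docstring example can be reproduced directly with the same filtering steps (data taken from the docstring):

import numpy as np

dense_mat = np.matrix([[4, 20, 55],    # day of the year
                       [2, 4, 2],      # day of the week
                       [22, 21, 23],   # hour
                       [2, 1, 1]])     # count
dow, hr = 2, 23                        # Mondays at 23:00
dow_filter = dense_mat[:, np.array(dense_mat[1, :] == dow).flatten()]
hr_filtered = dow_filter[:, np.isin(np.array(dow_filter[2, :]).flatten(),
                                    np.array([(hr - 1) % 24, hr, (hr + 1) % 24]))]
print(hr_filtered)  # columns 0 and 2 survive
# [[ 4 55]
#  [ 2  2]
#  [22 23]
#  [ 2  1]]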
Example 15
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame([{
        "id": 1,
        "mat": DenseMatrix(2, 2, range(4)),
        "image": Image("s3://foo/bar/1.png"),
        "annotations": [
            Row(
                label="cat",
                mask=wrap(np.random.rand(256, 256)),
                bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
            )
        ],
    }])
    df.show()
Example 16
    def test_eq(self):
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
        v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
        v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
        sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
        self.assertEqual(v1, v2)
        self.assertEqual(v1, v3)
        self.assertFalse(v2 == v4)
        self.assertFalse(v1 == v5)
        self.assertFalse(v1 == v6)
        # this is done as Dense and Sparse matrices can be semantically
        # equal while still implementing a different __eq__ method
        self.assertEqual(dm1, sm1)
        self.assertEqual(sm1, dm1)
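The cross-representation equality asserted above also holds for a matrix and its own sparse conversion; a quick sketch:

import numpy as np
from pyspark.ml.linalg import DenseMatrix

dm = DenseMatrix(2, 2, [2, 0, 0, 0])
sm = dm.toSparse()
assert dm == sm and sm == dm                       # semantic equality both ways
assert np.array_equal(dm.toArray(), sm.toArray())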
Example 17
np.random.seed(0)
g = np.array([0., 1., 2., 0.])
x = np.array([
    [1, -1],
    [2, -2],
    [3, -3],
    [4, -4.],
])
b = np.array([0., 1.])
y = g + np.dot(x, b) + np.random.normal(scale=.01, size=g.size)

HR = '-' * 50
print(HR)
print('Version 1')
# Correct version
dm = DenseMatrix(numRows=x.shape[0], numCols=x.shape[1], values=x.ravel(order='F').tolist())
np.testing.assert_equal(x, dm.toArray())
print(dm.toArray())
spark.createDataFrame([Row(genotypes=g.tolist(), phenotypes=y.tolist(), covariates=dm)])\
    .select(expand_struct(linear_regression_gwas('genotypes', 'phenotypes', 'covariates')))\
    .show()

print(HR)
print('Version 2')
# Version matching the demo notebook, with an explicit matrix field in C order (wrong: DenseMatrix expects column-major values)
dm = DenseMatrix(numRows=x.shape[0], numCols=x.shape[1], values=x.ravel(order='C').tolist())
print(dm.toArray())
spark.createDataFrame([Row(genotypes=g.tolist(), phenotypes=y.tolist(), covariates=dm)])\
    .select(expand_struct(linear_regression_gwas('genotypes', 'phenotypes', 'covariates')))\
    .show()
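The two versions differ only in ravel order: DenseMatrix interprets values as column-major, so only order='F' round-trips, while row-major values need isTransposed=True. A minimal sketch:

import numpy as np
from pyspark.ml.linalg import DenseMatrix

x = np.array([[1., -1.], [2., -2.], [3., -3.], [4., -4.]])
f_order = DenseMatrix(4, 2, x.ravel(order='F').tolist())                     # round-trips
c_order = DenseMatrix(4, 2, x.ravel(order='C').tolist())                     # scrambled
r_major = DenseMatrix(4, 2, x.ravel(order='C').tolist(), isTransposed=True)  # round-trips
assert np.array_equal(f_order.toArray(), x)
assert not np.array_equal(c_order.toArray(), x)
assert np.array_equal(r_major.toArray(), x)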
Example 18
# rows = sc.parallelize([
#     Vectors.sparse(5, {1: 1.0, 3: 7.0}),
#     Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
#     Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
# ])

# mat = RowMatrix(rows)
mat_orig_data = RowMatrix(orig_2D_matrix.map(lambda x: Row(x[1], x[0])))
# Compute the top 4 principal components.
# Principal components are stored in a local dense matrix.
# pc = mat_FROM_SVD.computePrincipalComponents(2)

# Project the rows to the linear space spanned by the top 4 principal components.
# projected = mat.multiply(pc)
# princ_comps_READY = DenseMatrix(len(princ_comps[0]), len(princ_comps), princ_comps.tolist())
# DenseMatrix expects a flat, column-major list of values
princ_comps_READY = DenseMatrix(15, 2, princ_comps.flatten(order='F').tolist())
projected = mat_orig_data.multiply(princ_comps_READY)
projected.rows.map(lambda x: (x, )).toDF().show()

#
# spark = SparkSession.Builder().getOrCreate()
#
#
# # ------------
# from pyspark.mllib.linalg import Vectors
# from pyspark.mllib.linalg.distributed import RowMatrix
#
# rows = spark.sparkContext.parallelize([
#     Vectors.sparse(5, {1: 1.0, 3: 7.0}),
#     Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
#     Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)