def test_spark_ml_matrix(self):
    """Write square Spark ML dense matrices via rikai and read them back.

    Each matrix is built from ``range(side * side)``; the expected value
    read back from parquet is the transposed row-major reshape of that range.
    """
    # (name, side length) of every square matrix under test.
    cases = ((1, 2), (2, 3))

    df = self.spark.createDataFrame([
        {"name": name, "mat": DenseMatrix(side, side, range(side * side))}
        for name, side in cases
    ])
    df.write.mode("overwrite").format("rikai").save(self.test_dir)
    df.show()

    records = list(self._read_parquets(self.test_dir))
    records.sort(key=lambda row: row["name"])

    expected = [
        {
            "name": name,
            "mat": np.arange(side * side, dtype=np.float64).reshape(side, side).T,
        }
        for name, side in cases
    ]

    for want, got in zip(expected, records):
        self.assertEqual(want["name"], got["name"])
        self.assertTrue(np.array_equal(want["mat"], got["mat"]))
def testInitialBiasAndWeightsAffectResult(prostateDataset):
    """Check that explicit initial weights/biases change the predictions.

    Two identically configured, reproducible estimators are trained on the
    same split; only the second one receives initial weights and biases, and
    the resulting prediction frames must differ.
    """
    trainingData, testingData = prostateDataset.randomSplit([0.9, 0.1], 1)

    def buildEstimator():
        # Fixed seed + reproducible=True so that any difference between the
        # two runs comes from the initial weights/biases alone.
        return H2ODeepLearning(
            seed=42,
            reproducible=True,
            labelCol="CAPSULE",
            featuresCols=["AGE", "RACE", "DPROS", "DCAPS"],
            hidden=[3],
        )

    baselinePredictions = buildEstimator().fit(trainingData).transform(testingData)

    customized = buildEstimator()
    initialWeights = [
        DenseMatrix(3, 4, [.1, .2, .3, .4, .4, .5, .6, .7, .7, .8, .9, .6], False),
        DenseMatrix(1, 3, [.2, .3, .4], False),
    ]
    initialBiases = [DenseVector([.1, .2, .3]), DenseVector([.1])]
    customized.setInitialWeights(initialWeights)
    customized.setInitialBiases(initialBiases)
    customizedPredictions = customized.fit(trainingData).transform(testingData)

    unit_test_utils.assert_data_frames_have_different_values(
        baselinePredictions, customizedPredictions)
class MatrixUDTTests(MLlibTestCase):
    """Tests for ``MatrixUDT``: JSON schema, serialization and inference."""

    # Fixtures: dense/sparse matrices in both storage orders.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        """The UDT survives a ``jsonValue`` / ``fromJson`` round trip."""
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        """Every fixture matrix survives serialize / deserialize."""
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        """A DataFrame built from matrices infers the UDT and keeps values."""
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual, not assertTrue: assertTrue(a, b) treats b as the
        # failure message and would pass for any truthy a.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
def test_spark_ml_matrix(spark: SparkSession, tmp_path: Path):
    """Write square Spark ML dense matrices via rikai and read them back.

    Each matrix is built from ``range(side * side)``; the expected value
    read back from parquet is the transposed row-major reshape of that range.
    """
    test_dir = str(tmp_path)
    # (name, side length) of every square matrix under test.
    cases = ((1, 2), (2, 3))

    df = spark.createDataFrame([
        {"name": name, "mat": DenseMatrix(side, side, range(side * side))}
        for name, side in cases
    ])
    df.write.mode("overwrite").format("rikai").save(test_dir)
    df.show()

    records = list(_read_parquets(test_dir))
    records.sort(key=lambda row: row["name"])

    expected = [
        {
            "name": name,
            "mat": np.arange(side * side, dtype=np.float64).reshape(side, side).T,
        }
        for name, side in cases
    ]

    for want, got in zip(expected, records):
        assert want["name"] == got["name"]
        assert np.array_equal(want["mat"], got["mat"])
def test_repr_dense_matrix(self):
    """Check ``repr`` of dense matrices, including the truncated form.

    Uses ``assertEqual``: the previous ``assertTrue(repr(mat), expected)``
    treated the expected string as the failure message and could never fail.
    """
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
    self.assertEqual(
        repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)")

    # The trailing flag in the repr is the matrix's isTransposed value.
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
    self.assertEqual(
        repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)")

    # With 18 values the repr is expected to be elided to the first and
    # last eight entries around "...".
    mat = DenseMatrix(6, 3, zeros(18))
    self.assertEqual(
        repr(mat),
        "DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., "
        "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)",
    )
def test_dense_matrix_is_transposed(self):
    """A transposed dense matrix equals its column-major twin.

    Also checks element access, ``toArray`` and the sparse conversion of
    the transposed matrix.
    """
    transposed = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
    column_major = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
    self.assertEqual(transposed, column_major)

    expected = [[0, 4], [1, 6], [3, 9]]
    for i, row in enumerate(expected):
        for j, value in enumerate(row):
            self.assertEqual(transposed[i, j], value)
    self.assertTrue(array_equal(transposed.toArray(), expected))

    # The sparse form drops the single zero entry at (0, 0).
    sparse = transposed.toSparse()
    self.assertTrue(array_equal(sparse.rowIndices, [1, 2, 0, 1, 2]))
    self.assertTrue(array_equal(sparse.colPtrs, [0, 2, 5]))
    self.assertTrue(array_equal(sparse.values, [1, 3, 4, 6, 9]))
def test_dense_matrix_is_transposed(self):
    """Compare a row-major (transposed) dense matrix with a column-major one.

    Verifies equality, per-element indexing, the dense array view and the
    structure produced by ``toSparse``.
    """
    row_major = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
    reference = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
    self.assertEqual(row_major, reference)

    rows = [[0, 4], [1, 6], [3, 9]]
    for r in range(3):
        for c in range(2):
            self.assertEqual(row_major[r, c], rows[r][c])

    dense_view = row_major.toArray()
    self.assertTrue(array_equal(dense_view, rows))

    as_sparse = row_major.toSparse()
    row_indices_ok = array_equal(as_sparse.rowIndices, [1, 2, 0, 1, 2])
    self.assertTrue(row_indices_ok)
    col_ptrs_ok = array_equal(as_sparse.colPtrs, [0, 2, 5])
    self.assertTrue(col_ptrs_ok)
    values_ok = array_equal(as_sparse.values, [1, 3, 4, 6, 9])
    self.assertTrue(values_ok)
def all_time_mat(matrices):
    """Stack per-year matrices into one matrix covering all years.

    Input is a collection of matrices for one crime type inside a district,
    for a day of the week during an hour. They are stacked so statistics
    over all available years can be computed later on.

    :param matrices: iterable of objects exposing ``toArray()``; the value
        at ``[-1, 0]`` of each array is read as that matrix's year.
    :return: a row-major ``DenseMatrix`` holding every input matrix stacked
        vertically, each right-padded to the widest input, or ``None`` when
        the input is empty, a year occurs more than once, or any step fails.
    """
    try:
        # Materialise exactly once: `matrices` may be a one-shot iterator,
        # which a second pass would find already exhausted.
        mat_array = [m.toArray() for m in matrices]
        if not mat_array:
            return None

        # Should be one matrix per year if done correctly, so reject any
        # repeated year (not only a repeat of the first one).
        seen_years = set()
        for yearly_matrix in mat_array:
            year = yearly_matrix[-1, 0]
            if year in seen_years:
                return None
            seen_years.add(year)

        max_len = max(arr.shape[1] for arr in mat_array)
        # Right-pad the columns with the constant 5000 so all matrices share
        # one width. NOTE(review): 5000 looks like an out-of-range sentinel
        # for downstream consumers -- confirm.
        stacked_padded_matrix = np.array([
            np.pad(arr, ((0, 0), (0, max_len - arr.shape[1])),
                   'constant', constant_values=5000)
            for arr in mat_array
        ])
        stacked_padded_matrix = stacked_padded_matrix.reshape(-1, max_len)
        # flatten() is row-major, hence isTransposed=True.
        return DenseMatrix(numRows=stacked_padded_matrix.shape[0],
                           numCols=stacked_padded_matrix.shape[1],
                           values=stacked_padded_matrix.flatten(),
                           isTransposed=True)
    except Exception:
        # Deliberately best-effort: any malformed input yields None rather
        # than failing the caller.
        return None
def test_convert_matrix(spark):
    """A 2-D ndarray literal becomes a column-major ``DenseMatrix`` column."""
    frame = spark.createDataFrame(['a', 'b'], StringType())
    source = np.array([[1.0, 2.1, 3.2],
                       [4.3, 5.4, 6.5]])
    collected = frame.withColumn("matrix", lit(source)).collect()

    # DenseMatrix values are listed column by column.
    expected_matrix = DenseMatrix(2, 3, [1.0, 4.3, 2.1, 5.4, 3.2, 6.5])
    for index in (0, 1):
        assert collected[index].matrix == expected_matrix
def test_matrix_indexing(self):
    """Element access follows column-major layout and rejects bad indices."""
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])

    expected_rows = [[0, 6], [1, 8], [4, 10]]
    for i, row in enumerate(expected_rows):
        for j, value in enumerate(row):
            self.assertEqual(mat[i, j], value)

    # Negative and out-of-range positions must raise IndexError.
    for bad_index in [(-1, 0), (4, 1), (3, 4)]:
        with self.assertRaises(IndexError):
            mat[bad_index]
def test_serialize(self):
    """Run the serialization check over dense/sparse vectors and matrices."""

    def samples():
        # Yielded lazily so each object is built right before it is checked.
        yield DenseVector(range(10))
        yield DenseVector(array([1.0, 2.0, 3.0, 4.0]))
        yield DenseVector(pyarray.array("d", range(10)))
        yield SparseVector(4, {1: 1, 3: 2})
        yield SparseVector(3, {})
        yield DenseMatrix(2, 3, range(6))
        yield SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2],
                           [1.0, 2.0, 4.0, 5.0])

    for sample in samples():
        self._test_serialize(sample)
def save_corr_heatmap(corr: DenseMatrix, columns: List[str], path: str,
                      title: str = "Correlation Matrix") -> None:
    """Render a correlation matrix as an annotated heatmap and save it.

    :param corr: matrix to plot; converted through ``toArray().tolist()``.
    :param columns: labels used for both the x and y tick labels.
    :param path: destination passed to ``plt.savefig``.
    :param title: title drawn above the heatmap.
    """
    df = pd.DataFrame(corr.toArray().tolist())
    fig = plt.figure(figsize=(13, 8))
    try:
        sns.heatmap(df, xticklabels=columns, yticklabels=columns, annot=True)
        plt.title(title)
        plt.savefig(path)
    finally:
        # Close this figure even if plotting or saving fails, so failed
        # calls do not leave open figures behind.
        plt.close(fig)
def setRegularizationMatrixFactor(self, value):
    """
    Sets the regularization matrix to ``value`` times the identity matrix
    of size ``self._featuresSize``.

    :param value: Float
    :return: RecursiveLeastSquaresFilter
    """
    scaledIdentity = np.eye(self._featuresSize) * value
    numRows, numCols = scaledIdentity.shape
    # DenseMatrix takes its values column by column, hence order="F".
    columnMajorValues = scaledIdentity.reshape(numRows * numCols, order="F")
    regularizationMatrix = DenseMatrix(numRows, numCols, columnMajorValues)
    return self._set(regularizationMatrix=regularizationMatrix)
def yearly_dayofweek_hour_matrix(doy_ar, dow_ar, hr_ar, hc_ar, dow, hr, year):
    """Filter one year of crimes down to a given weekday and hour window.

    Input is all crimes of a certain type for a year, within a district,
    e.g. all Narcotics crimes of 2018 in District 09.

    params:
        doy_ar: Day of the year array
        dow_ar: Day of the week array
        hr_ar: Hour array
        hc_ar: Hour count array

        The arrays should be of the same length. Stacked on top of each
        other they form a matrix in which one column holds the day of the
        year, day of the week, hour and count of one record.

        dow: Day of the week value for the incoming row
        hr: Hour of the incoming row
        year: Year of the incoming row

    Only the columns on the same day of the week and within +-1 hour
    (wrapping around midnight) are kept.

    returns:
        A dense, filtered, row-major matrix of shape (5, num matching
        crimes): the four input rows followed by a row holding ``year``.

    Example:
        [4,20,55],  (Day of the year)
        [2,4,2],    (Day of the week)
        [22,21,23]  (Hour)
        [2,1,1]     (Count)

        Filter on crimes at mondays 23:00 -> column 0 and 2
        Filter on crimes at mondays 00:00 -> column 2
        Filter on crimes at wednesdays 20:00 -> column 1
    """
    # All crimes of that year, one column per record. A plain ndarray is
    # used instead of the discouraged np.matrix class.
    crimes = np.array([doy_ar, dow_ar, hr_ar, hc_ar])

    # Only the same day of the week.
    same_dow = crimes[:, crimes[1, :] == dow]

    # +-1 hour, wrapping around midnight.
    wanted_hours = np.array([(hr - 1) % 24, hr, (hr + 1) % 24])
    hr_filtered = same_dow[:, np.isin(same_dow[2, :], wanted_hours)]

    # Last row stores the year the records belong to.
    year_row = np.full((1, hr_filtered.shape[1]), year)
    vals = np.append(hr_filtered, year_row, axis=0)

    # flatten() is row-major, hence isTransposed=True.
    return DenseMatrix(numRows=vals.shape[0], numCols=vals.shape[1],
                       values=vals.flatten(), isTransposed=True)
def test_readme_example(spark: SparkSession):
    """Build the README's single-row sample DataFrame and display it."""
    record = {
        "id": 1,
        "mat": DenseMatrix(2, 2, range(4)),
        "image": Image("s3://foo/bar/1.png"),
    }
    # One "cat" annotation carrying a random 256x256 mask and a bounding box.
    record["annotations"] = [
        Row(
            label="cat",
            mask=wrap(np.random.rand(256, 256)),
            bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
        )
    ]

    df = spark.createDataFrame([record])
    df.show()
def test_eq(self):
    """Equality across dense and sparse vectors and matrices."""
    v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
    v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
    v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
    v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
    dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
    sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])

    # Same contents, regardless of dense/sparse representation.
    for left, right in ((v1, v2), (v1, v3)):
        self.assertEqual(left, right)

    # Different size or different values must not compare equal.
    for left, right in ((v2, v4), (v1, v5), (v1, v6)):
        self.assertFalse(left == right)

    # this is done as Dense and Sparse matrices can be semantically
    # equal while still implementing a different __eq__ method
    for left, right in ((dm1, sm1), (sm1, dm1)):
        self.assertEqual(left, right)
# Demo: the covariate matrix must be passed to DenseMatrix in column-major
# ('F') order; row-major ('C') values produce a different matrix.
np.random.seed(0)

g = np.array([0., 1., 2., 0.])
x = np.array([[1, -1],
              [2, -2],
              [3, -3],
              [4, -4.]])
b = np.array([0., 1.])
y = g + np.dot(x, b) + np.random.normal(scale=.01, size=g.size)

HR = '-' * 50


def _show_gwas(covariates):
    # Run the GWAS linear regression on a single-row frame and print it.
    frame = spark.createDataFrame(
        [Row(genotypes=g.tolist(), phenotypes=y.tolist(), covariates=covariates)]
    )
    stats = linear_regression_gwas('genotypes', 'phenotypes', 'covariates')
    frame.select(expand_struct(stats)).show()


print(HR)
print('Version 1')
# Correct version
dm = DenseMatrix(
    numRows=x.shape[0],
    numCols=x.shape[1],
    values=x.ravel(order='F').tolist(),
)
np.testing.assert_equal(x, dm.toArray())
print(dm.toArray())
_show_gwas(dm)

print(HR)
print('Version 2')
# Version also like demo notebook with explicit matrix field (also wrong)
dm = DenseMatrix(
    numRows=x.shape[0],
    numCols=x.shape[1],
    values=x.ravel(order='C').tolist(),
)
print(dm.toArray())
_show_gwas(dm)
# rows = sc.parallelize([
#     Vectors.sparse(5, {1: 1.0, 3: 7.0}),
#     Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
#     Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
# ])
# mat = RowMatrix(rows)

# Wrap the original data in a distributed RowMatrix, swapping the two
# fields of every record.
mat_orig_data = RowMatrix(orig_2D_matrix.map(lambda x: Row(x[1], x[0])))

# Compute the top 4 principal components.
# Principal components are stored in a local dense matrix.
# pc = mat_FROM_SVD.computePrincipalComponents(2)

# Project the rows to the linear space spanned by the top 4 principal components.
# projected = mat.multiply(pc)
# princ_comps_READY = DenseMatrix(len(princ_comps[0]), len(princ_comps), princ_comps.tolist())
# NOTE(review): the 15 x 2 shape is hard-coded; presumably 15 features and
# 2 components -- confirm against how princ_comps is produced.
princ_comps_READY = DenseMatrix(15, 2, princ_comps.tolist())

# RowMatrix.multiply only accepts a local DenseMatrix, so pass the converted
# matrix rather than the raw princ_comps object.
projected = mat_orig_data.multiply(princ_comps_READY)

projected.rows.map(lambda x: (x, )).toDF().show()
#
# spark = SparkSession.Builder().getOrCreate()
#
#
# # ------------
# from pyspark.mllib.linalg import Vectors
# from pyspark.mllib.linalg.distributed import RowMatrix
#
# rows = spark.sparkContext.parallelize([
#     Vectors.sparse(5, {1: 1.0, 3: 7.0}),
#     Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
#     Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)