def within_group_scatter(data: pyspark.sql.DataFrame, features, response, targets):
    """Compute the pooled within-group scatter matrix S_w.

    For each class label in *targets*, the sample covariance of the feature
    rows belonging to that class is scaled by (n_t - 1) — undoing the
    unbiased-covariance normalization — and the scaled matrices are summed.

    :param data: DataFrame containing the feature columns and the response column
    :param features: list of feature column names (defines p)
    :param response: name of the grouping/response column
    :param targets: iterable of class labels to group rows by
    :return: (p, p) numpy array where p = len(features)
    """
    p = len(features)
    sw = numpy.zeros((p, p))
    for target in targets:
        # Column-expression filter instead of a formatted SQL string:
        # the original "{} == '{}'".format(...) broke (and was injectable)
        # whenever a target label contained a quote character.
        df_t = data.filter(data[response] == target)
        X_t = RowMatrix(df_t.select(features).rdd.map(numpy.array))
        sw += X_t.computeCovariance().toArray() * (df_t.count() - 1)
    return sw
def get_gmm(self, k, sample_fraction=None, retry=True):
    """Return a GaussianMixtureModel with ``k`` components.

    For ``k == 1`` the single Gaussian is taken directly from the column
    summary statistics and covariance of the (optionally down-sampled)
    training data; for larger ``k`` an ML mixture model is fitted.
    """
    if k != 1:
        model = self.fit_ml_model(k, sample_fraction=sample_fraction, retry=retry)
        gaussians = [
            Gaussian(g.mean, g.cov.toArray())
            for g in model.gaussiansDF.collect()
        ]
        return GaussianMixtureModel(
            model.weights, gaussians, model.summary.logLikelihood
        )

    # Degenerate one-component case: no fitting needed, the mean and
    # covariance of the data describe the single Gaussian exactly.
    data = self.mllib_training_data
    if sample_fraction:
        data = data.sample(False, sample_fraction)
    matrix = RowMatrix(data)
    stats = matrix.computeColumnSummaryStatistics()
    gaussian = Gaussian(stats.mean(), matrix.computeCovariance().toArray())
    # No likelihood is available without a fitted model.
    return GaussianMixtureModel([1.0], [gaussian], None)
# NOTE(review): `rdd`, `nrows`, and `ncols` are defined earlier in the file
# (outside this chunk) — presumably from parsing the .mtx header; confirm.
print("ncol: %d, nrow %d" % (ncols, nrows))

# Build a coordinate matrix from the (i, j, value) entry tuples.
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())
print("finished using pyspark")

print("now use SparkSession")
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Space-delimited read of the same matrix-market file, letting Spark
# infer numeric column types.
reader = spark.read.option("delimiter", " ")
df_2 = reader.csv('./data/lpi_ceria3d_b.mtx', header=False, inferSchema=True)
df_2.printSchema()

# Re-interpret the rows as a RowMatrix so covariance and multiply
# become available.
row_mat = RowMatrix(df_2.rdd.map(tuple))
print("num rows in row matrix %d, num_cols %d" % (row_mat.numRows(), row_mat.numCols()))
print("print covariance")
print(row_mat.computeCovariance())

# Multiply by a fixed 3x1 dense column vector.
dm = Matrices.dense(3, 1, [4, 5, 6])
print("multiply row Matrix")
result = row_mat.multiply(dm)