def sum_of_squared_errors(data: pyspark.sql.DataFrame):
    """Compute the sum of squared errors (SSE) of *data* around its column means.

    Each row is centered on the per-column means and its squared Euclidean
    norm is accumulated across the RDD.

    :param data: data for which the SSE is computed
    :return: the scalar sum of squared errors
    """
    logger.info("Computing SSE")
    rdd = as_rdd_of_array(data)
    means = column_means(rdd)

    def _squared_error(row):
        # (x - mean)^T (x - mean) for a single observation
        centered = row - means
        return centered.T.dot(centered)

    return rdd.map(_squared_error).reduce(lambda a, b: a + b)
def setUpClass(cls):
    """Build the shared test fixtures once for the class.

    Prepares a standardized 10-row slice of the iris data, its Spark
    DataFrame counterpart, a scikit-learn RBFSampler baseline transform,
    and the Spark-side Fourier features computed with a matching seed
    and component count.
    """
    super().setUpClass()
    cls.log("Stats")

    # First 10 iris rows, all four features, standardized.
    iris = datasets.load_iris()
    cls._X = scale(iris.data[:10, :4])
    cls._features = ["sl", "sw", "pl", "pw"]
    frame = pandas.DataFrame(data=cls._X, columns=cls._features)
    cls._spark_lo = cls.spark().createDataFrame(frame)

    # scikit-learn reference transformation to compare Spark results against.
    cls.sbf_feature = sklearn.kernel_approximation.RBFSampler(
        random_state=23, n_components=5)
    cls._sbf_X_transformed = cls.sbf_feature.fit_transform(cls._X)

    # Spark-side random Fourier features (same seed / component count).
    cls.Xf, cls.w, cls.b = fourier(
        RowMatrix(as_rdd_of_array(cls._spark_lo)), 5, 23, 1)
def loglik(data: pyspark.sql.DataFrame):
    """
    Computes the log-likelihood using a multivariate normal model

    :param data: data for which loglik is computed
    :return: returns the loglik
    """
    logger.info("Computing loglik")
    rdd = as_rdd_of_array(data)
    means = column_means(rdd)
    # NOTE(review): correlation_matrix is passed as the covariance of the
    # normal model — confirm it actually returns a covariance matrix.
    cov = correlation_matrix(rdd)
    # Use logpdf directly instead of scipy.log(pdf(...)):
    #  - scipy.log was a deprecated numpy alias, removed from modern SciPy
    #  - pdf() can underflow to 0 for unlikely observations, making the
    #    log -inf; logpdf computes in log-space and stays finite
    logpdf = scipy.stats.multivariate_normal.logpdf
    loglik = (rdd
              # per-observation log-density
              .map(lambda x: logpdf(x, means, cov))
              # densities are in log-space, so summing gives the joint loglik
              .reduce(lambda x, y: x + y))
    return loglik
def _precision(data):
    """Compute the precision matrix of the 'features' column of *data*.

    The feature vectors are centered before the precision (inverse
    covariance) is estimated.
    """
    logger.info("Computing precision")
    feature_rdd = as_rdd_of_array(data.select("features"))
    centered = RowMatrix(center(feature_rdd))
    return precision(centered)
def _feature_matrix(self, data):
    """Return the configured feature columns of *data* as an RDD of arrays."""
    selected = data.select(self.features)
    return as_rdd_of_array(selected)