Code example #1
import pyspark.sql

# logger, as_rdd_of_array and column_means are module-level helpers defined elsewhere
def sum_of_squared_errors(data: pyspark.sql.DataFrame):
    """Computes the sum of squared errors (SSE) around the column means."""
    logger.info("Computing SSE")
    rdd = as_rdd_of_array(data)
    means = column_means(rdd)
    # per row: squared Euclidean distance to the column means; the reduce sums over rows
    sse = (rdd
           .map(lambda x: (x - means).T.dot(x - means))
           .reduce(lambda x, y: x + y))
    return sse
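For intuition, the same statistic can be computed locally with NumPy. This is a minimal sketch with a made-up toy matrix, showing that the map/reduce above sums each row's squared Euclidean distance to the column means:

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # toy data for illustration
means = X.mean(axis=0)
# the same computation as the Spark map/reduce above
sse = sum((x - means).T.dot(x - means) for x in X)
assert np.isclose(sse, ((X - means) ** 2).sum())  # equivalent closed form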
Code example #2
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("Stats")

        iris = datasets.load_iris()
        cls._X = iris.data[:10, :4]
        cls._X = scale(cls._X)
        cls._features = ["sl", "sw", "pl", "pw"]

        df = pandas.DataFrame(data=cls._X, columns=cls._features)
        cls._spark_lo = cls.spark().createDataFrame(df)

        # reference transformation with sklearn's random Fourier features
        cls.sbf_feature = sklearn.kernel_approximation.RBFSampler(
            random_state=23, n_components=5)

        cls._sbf_X_transformed = cls.sbf_feature.fit_transform(cls._X)
        cls.Xf, cls.w, cls.b = fourier(
            RowMatrix(as_rdd_of_array(cls._spark_lo)), 5, 23, 1)
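The fixture above compares sklearn's RBFSampler against the Spark-side fourier function. As a rough, self-contained sketch of the sklearn half (synthetic data stands in for the scaled iris subset; the exact meaning of fourier's positional arguments is not visible in this excerpt):

import numpy as np
from sklearn.kernel_approximation import RBFSampler

X = np.random.rand(10, 4)                 # synthetic stand-in for the scaled iris data
sampler = RBFSampler(n_components=5, random_state=23)
X_transformed = sampler.fit_transform(X)  # random Fourier features approximating an RBF kernel
print(X_transformed.shape)                # (10, 5)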
Code example #3
import pyspark.sql
import scipy.stats

def loglik(data: pyspark.sql.DataFrame):
    """
    Computes the log-likelihood of the data under a multivariate normal model.

    :param data: data for which the log-likelihood is computed
    :return: the log-likelihood
    """
    # logpdf replaces the removed scipy.log(pdf(...)) and is numerically more stable
    mvn = scipy.stats.multivariate_normal.logpdf
    logger.info("Computing loglik")
    rdd = as_rdd_of_array(data)
    means = column_means(rdd)
    cov = correlation_matrix(rdd)
    loglik = (rdd
              # compute the log-likelihood per observation
              .map(lambda x: mvn(x, means, cov))
              # the per-row values are in log-space, so summing them
              # yields the log-likelihood of the whole data set
              .reduce(lambda x, y: x + y))
    return loglik
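To see what the pipeline computes, here is a minimal local equivalent using SciPy directly; the random data is illustrative, and in the Spark version the mean and covariance come from the column_means and correlation_matrix helpers:

import numpy as np
import scipy.stats

rng = np.random.default_rng(23)
X = rng.normal(size=(100, 4))    # illustrative data
means = X.mean(axis=0)
cov = np.cov(X, rowvar=False)
# logpdf evaluates all rows at once; summing gives the total log-likelihood
ll = scipy.stats.multivariate_normal.logpdf(X, means, cov).sum()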
Code example #4
def _precision(data):
    """Computes the precision matrix of the feature column."""
    logger.info("Computing precision")
    X = as_rdd_of_array(data.select("features"))
    X = RowMatrix(center(X))
    pres = precision(X)
    return pres
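Assuming the precision helper returns the inverse of the covariance matrix of the centered rows (the usual definition of a precision matrix; this excerpt does not show its implementation), a local NumPy equivalent would be:

import numpy as np

X = np.random.rand(100, 4)              # illustrative data
Xc = X - X.mean(axis=0)                 # center the columns, as center() does above
cov = Xc.T.dot(Xc) / (Xc.shape[0] - 1)  # sample covariance
prec = np.linalg.inv(cov)               # assumed: precision = inverse covariance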
Code example #5
def _feature_matrix(self, data):
    # select the configured feature columns and convert them to an RDD of arrays
    return as_rdd_of_array(data.select(self.features))