Example #1
File: stat.py Project: BViki/spark
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from pyspark.mllib.linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> pearsonCorr = Statistics.corr(rdd)
        >>> print str(pearsonCorr).replace('nan', 'NaN')
        [[ 1.          0.05564149         NaN  0.40047142]
         [ 0.05564149  1.                 NaN  0.91359586]
         [        NaN         NaN  1.                 NaN]
         [ 0.40047142  0.91359586         NaN  1.        ]]
        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
        >>> print str(spearmanCorr).replace('nan', 'NaN')
        [[ 1.          0.10540926         NaN  0.4       ]
         [ 0.10540926  1.                 NaN  0.9486833 ]
         [        NaN         NaN  1.                 NaN]
         [ 0.4         0.9486833          NaN  1.        ]]
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        # Check inputs to determine whether a single value or a matrix is needed for output.
        # Since it's legal for users to use the method name as the second argument, we need to
        # check if y is used to specify the method name instead.
        if type(y) == str:
            raise TypeError("Use 'method=' to specify method name.")

        if not y:
            return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray()
        else:
            return callMLlibFunc("corr", x.map(float), y.map(float), method)
Example #2
    def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):
        """
        Runs the bisecting k-means algorithm and returns the model.

        :param rdd:
          Training points as an `RDD` of `Vector` or convertible
          sequence types.
        :param k:
          The desired number of leaf clusters. The actual number could
          be smaller if there are no divisible leaf clusters.
          (default: 4)
        :param maxIterations:
          Maximum number of iterations allowed to split clusters.
          (default: 20)
        :param minDivisibleClusterSize:
          Minimum number of points (if >= 1.0) or the minimum proportion
          of points (if < 1.0) of a divisible cluster.
          (default: 1)
        :param seed:
          Random seed value for cluster initialization.
          (default: -1888008604 from classOf[BisectingKMeans].getName.##)
        """
        java_model = callMLlibFunc(
            "trainBisectingKMeans", rdd.map(_convert_to_vector),
            k, maxIterations, minDivisibleClusterSize, seed)
        return BisectingKMeansModel(java_model)
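
A minimal usage sketch for this trainer (added for illustration, not part of the source); it assumes a live SparkContext `sc` as in the doctests above, and the points are made up:

from pyspark.mllib.clustering import BisectingKMeans

# Two obvious groups of 1-D points (illustrative data).
data = sc.parallelize([[0.0], [0.5], [9.0], [9.5]])
model = BisectingKMeans.train(data, k=2, maxIterations=5)
print(model.clusterCenters)   # centers of the leaf clusters
print(model.predict([0.2]))   # index of the closest center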
Example #3
    def update(self, data, decayFactor, timeUnit):
        """Update the centroids according to the data.

        :param data:
          RDD with new data for the model update.
        :param decayFactor:
          Forgetfulness of the previous centroids.
        :param timeUnit:
          Can be "batches" or "points". If "points", the decay factor is
          raised to the power of the number of new points; if "batches",
          the decay factor is used as is.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be of an RDD, got %s." % type(data))
        data = data.map(_convert_to_vector)
        decayFactor = float(decayFactor)
        if timeUnit not in ["batches", "points"]:
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        vectorCenters = [_convert_to_vector(center) for center in self.centers]
        updatedModel = callMLlibFunc(
            "updateStreamingKMeansModel", vectorCenters, self._clusterWeights,
            data, decayFactor, timeUnit)
        self.centers = array(updatedModel[0])
        self._clusterWeights = list(updatedModel[1])
        return self
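
A hedged sketch of calling `update` directly (in practice `StreamingKMeans` drives this from a DStream); `sc` and the starting centers/weights below are assumptions for illustration:

from pyspark.mllib.clustering import StreamingKMeansModel

# Start from two hand-picked centers with unit weights.
initial = StreamingKMeansModel(clusterCenters=[[0.0, 0.0], [1.0, 1.0]],
                               clusterWeights=[1.0, 1.0])
batch = sc.parallelize([[0.1, 0.1], [0.9, 1.1]])
updated = initial.update(batch, decayFactor=1.0, timeUnit="batches")
print(updated.clusterCenters)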
Example #4
    def colStats(rdd):
        """
        Computes column-wise summary statistics for the input RDD[Vector].

        :param rdd: an RDD[Vector] for which column-wise summary statistics
                    are to be computed.
        :return: :class:`MultivariateStatisticalSummary` object containing
                 column-wise summary statistics.

        >>> from pyspark.mllib.linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
        ...                       Vectors.dense([4, 5, 0,  3]),
        ...                       Vectors.dense([6, 7, 0,  8])])
        >>> cStats = Statistics.colStats(rdd)
        >>> cStats.mean()
        array([ 4.,  4.,  0.,  3.])
        >>> cStats.variance()
        array([  4.,  13.,   0.,  25.])
        >>> cStats.count()
        3L
        >>> cStats.numNonzeros()
        array([ 3.,  2.,  0.,  3.])
        >>> cStats.max()
        array([ 6.,  7.,  0.,  8.])
        >>> cStats.min()
        array([ 2.,  0.,  0., -2.])
        """
        cStats = callMLlibFunc("colStats", rdd.map(_convert_to_vector))
        return MultivariateStatisticalSummary(cStats)
Example #5
    def __init__(self, rows, numRows=0, numCols=0):
        """
        Note: This docstring is not shown publicly.

        Create a wrapper over a Java RowMatrix.

        Publicly, we require that `rows` be an RDD.  However, for
        internal usage, `rows` can also be a Java RowMatrix
        object, in which case we can wrap it directly.  This
        assists in clean matrix conversions.

        >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
        >>> mat = RowMatrix(rows)

        >>> mat_diff = RowMatrix(rows)
        >>> (mat_diff._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        False

        >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model)
        >>> (mat_same._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        True
        """
        if isinstance(rows, RDD):
            rows = rows.map(_convert_to_vector)
            java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols))
        elif (isinstance(rows, JavaObject)
              and rows.getClass().getSimpleName() == "RowMatrix"):
            java_matrix = rows
        else:
            raise TypeError("rows should be an RDD of vectors, got %s" % type(rows))

        self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
Example #6
    def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the Gamma
        distribution with the input shape and scale.

        :param sc: SparkContext used to create the RDD.
        :param shape: shape (> 0) parameter for the Gamma distribution
        :param scale: scale (> 0) parameter for the Gamma distribution
        :param size: Size of the RDD.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale).

        >>> from math import sqrt
        >>> shape = 1.0
        >>> scale = 2.0
        >>> expMean = shape * scale
        >>> expStd = sqrt(shape * scale * scale)
        >>> x = RandomRDDs.gammaRDD(sc, shape, scale, 1000, seed=2)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - expMean) < 0.5
        True
        >>> abs(stats.stdev() - expStd) < 0.5
        True
        """
        return callMLlibFunc("gammaRDD", sc._jsc, float(shape),
                             float(scale), size, numPartitions, seed)
Example #7
File: util.py Project: BViki/spark
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)
Example #8
 def train(cls, rdd, k, maxIterations=100, initMode="random"):
     """
     :param rdd:
       An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
       affinity matrix, which is the matrix A in the PIC paper.  The
       similarity s\ :sub:`ij`\ must be nonnegative.  This is a symmetric
        matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\. For any (i, j) with
       nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
       (j, i, s\ :sub:`ji`\) in the input.  Tuples with i = j are ignored,
       because it is assumed s\ :sub:`ij`\ = 0.0.
     :param k:
       Number of clusters.
     :param maxIterations:
       Maximum number of iterations of the PIC algorithm.
       (default: 100)
     :param initMode:
       Initialization mode. This can be either "random" to use
       a random vector as vertex properties, or "degree" to use
       normalized sum similarities.
       (default: "random")
     """
     model = callMLlibFunc(
         "trainPowerIterationClusteringModel", rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode
     )
     return PowerIterationClusteringModel(model)
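
An illustrative call (not from the source), assuming `sc`; the affinity tuples follow the (i, j, s_ij) format described in the docstring:

from pyspark.mllib.clustering import PowerIterationClustering

# Two tightly connected groups {0, 1, 2} and {3, 4}, plus one weak link.
affinities = sc.parallelize([
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),
    (3, 4, 1.0),
    (2, 3, 0.01),
])
model = PowerIterationClustering.train(affinities, k=2, maxIterations=30)
for a in model.assignments().collect():
    print(a.id, a.cluster)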
Example #9
    def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
              seed=None):
        """
        Train a matrix factorization model given an RDD of ratings by users
        for a subset of products. The ratings matrix is approximated as the
        product of two lower-rank matrices of a given rank (number of
        features). To solve for these features, ALS is run iteratively with
        a configurable level of parallelism.

        :param ratings:
          RDD of `Rating` or (userID, productID, rating) tuple.
        :param rank:
          Rank of the feature matrices computed (number of features).
        :param iterations:
          Number of iterations of ALS.
          (default: 5)
        :param lambda_:
          Regularization parameter.
          (default: 0.01)
        :param blocks:
          Number of blocks used to parallelize the computation. A value
          of -1 will use an auto-configured number of blocks.
          (default: -1)
        :param nonnegative:
          A value of True will solve least-squares with nonnegativity
          constraints.
          (default: False)
        :param seed:
          Random seed for initial matrix factorization model. A value
          of None will use system time as the seed.
          (default: None)
        """
        model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
                              lambda_, blocks, nonnegative, seed)
        return MatrixFactorizationModel(model)
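
A short usage sketch (illustrative ratings, assuming `sc`):

from pyspark.mllib.recommendation import ALS, Rating

ratings = sc.parallelize([
    Rating(1, 1, 5.0), Rating(1, 2, 1.0),
    Rating(2, 1, 1.0), Rating(2, 2, 5.0),
])
model = ALS.train(ratings, rank=2, iterations=10, lambda_=0.01, seed=42)
print(model.predict(1, 2))              # predicted rating for (user, product)
print(model.recommendProducts(1, 1))    # top-1 product recommendation for user 1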
Example #10
 def fit(self, data):
     """
     Computes a [[PCAModel]] that contains the principal components of the input vectors.
     :param data: source vectors
     """
     jmodel = callMLlibFunc("fitPCA", self.k, data)
     return PCAModel(jmodel)
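
A brief sketch of fitting and applying the model (assumed setup: a live `sc` and made-up vectors):

from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

data = sc.parallelize([Vectors.dense([1.0, 0.0, 7.0]),
                       Vectors.dense([2.0, 0.0, 3.5]),
                       Vectors.dense([4.0, 0.0, 0.1])])
model = PCA(k=2).fit(data)        # keep the top two principal components
projected = model.transform(data)
print(projected.collect())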
Example #11
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
     initialModel=None,
 ):
     """Train a k-means clustering model."""
     clusterInitialModel = []
     if initialModel is not None:
         if not isinstance(initialModel, KMeansModel):
             raise Exception(
                 "initialModel is of " + str(type(initialModel)) + ". It needs " "to be of <type 'KMeansModel'>"
             )
         clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
     model = callMLlibFunc(
         "trainKMeansModel",
         rdd.map(_convert_to_vector),
         k,
         maxIterations,
         runs,
         initializationMode,
         seed,
         initializationSteps,
         epsilon,
         clusterInitialModel,
     )
     centers = callJavaFunc(rdd.context, model.clusterCenters)
     return KMeansModel([c.toArray() for c in centers])
Example #12
 def __init__(self, predictionAndLabels):
     sc = predictionAndLabels.ctx
     sql_ctx = SQLContext.getOrCreate(sc)
     df = sql_ctx.createDataFrame(predictionAndLabels,
                                  schema=sql_ctx._inferSchema(predictionAndLabels))
     java_model = callMLlibFunc("newRankingMetrics", df._jdf)
     super(RankingMetrics, self).__init__(java_model)
Example #13
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
 ):
     """Train a k-means clustering model."""
     model = callMLlibFunc(
         "trainKMeansModel",
         rdd.map(_convert_to_vector),
         k,
         maxIterations,
         runs,
         initializationMode,
         seed,
         initializationSteps,
         epsilon,
     )
     centers = callJavaFunc(rdd.context, model.clusterCenters)
     return KMeansModel([c.toArray() for c in centers])
Example #14
    def uniformRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the
        uniform distribution U(0.0, 1.0).

        To transform the distribution in the generated RDD from U(0.0, 1.0)
        to U(a, b), use
        C{RandomRDDs.uniformRDD(sc, n, p, seed)\
          .map(lambda v: a + (b - a) * v)}

        :param sc: SparkContext used to create the RDD.
        :param size: Size of the RDD.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.

        >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed)
Example #15
 def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None):
     """Train a Gaussian Mixture clustering model."""
     weight, mu, sigma = callMLlibFunc("trainGaussianMixture",
                                       rdd.map(_convert_to_vector), k,
                                       convergenceTol, maxIterations, seed)
     mvg_obj = [MultivariateGaussian(mu[i], sigma[i]) for i in range(k)]
     return GaussianMixtureModel(weight, mvg_obj)
Example #16
File: fpm.py Project: 0xqq/spark
    def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000):
        """
        Finds the complete set of frequent sequential patterns in the
        input sequences of itemsets.

        :param data:
          The input data set, each element contains a sequence of
          itemsets.
        :param minSupport:
          The minimal support level of the sequential pattern, any
          pattern that appears more than (minSupport *
          size-of-the-dataset) times will be output.
          (default: 0.1)
        :param maxPatternLength:
          The maximal length of the sequential pattern; any frequent
          pattern no longer than maxPatternLength will be output.
          (default: 10)
        :param maxLocalProjDBSize:
          The maximum number of items (including delimiters used in the
          internal storage format) allowed in a projected database before
          local processing. If a projected database exceeds this size,
          another iteration of distributed prefix growth is run.
          (default: 32000000)
        """
        model = callMLlibFunc("trainPrefixSpanModel",
                              data, minSupport, maxPatternLength, maxLocalProjDBSize)
        return PrefixSpanModel(model)
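
An illustrative run (assuming `sc`); each element of the input is a sequence of itemsets, as the docstring describes:

from pyspark.mllib.fpm import PrefixSpan

sequences = sc.parallelize([
    [["a", "b"], ["c"]],
    [["a"], ["c", "b"], ["a", "b"]],
    [["a", "b"], ["e"]],
    [["f"]],
], 2)
model = PrefixSpan.train(sequences, minSupport=0.5, maxPatternLength=5)
for fs in sorted(model.freqSequences().collect()):
    print(fs.sequence, fs.freq)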
Example #17
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma^2), use
        C{RandomRDDs.normal(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        :param sc: SparkContext used to create the RDD.
        :param size: Size of the RDD.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).

        >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
Example #18
    def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the log normal
        distribution with the input mean and standard deviation.

        :param sc: SparkContext used to create the RDD.
        :param mean: mean for the log Normal distribution
        :param std: std for the log Normal distribution
        :param size: Size of the RDD.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of float comprised of i.i.d. samples ~ log N(mean, std).

        >>> from math import sqrt, exp
        >>> mean = 0.0
        >>> std = 1.0
        >>> expMean = exp(mean + 0.5 * std * std)
        >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std))
        >>> x = RandomRDDs.logNormalRDD(sc, mean, std, 1000, seed=2)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - expMean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(stats.stdev() - expStd) < 0.5
        True
        """
        return callMLlibFunc("logNormalRDD", sc._jsc, float(mean), float(std),
                             size, numPartitions, seed)
Example #19
 def _train(
     cls,
     data,
     algo,
     numClasses,
     categoricalFeaturesInfo,
     numTrees,
     featureSubsetStrategy,
     impurity,
     maxDepth,
     maxBins,
     seed,
 ):
     first = data.first()
     assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
     if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
         raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
     if seed is None:
         seed = random.randint(0, 1 << 30)
     model = callMLlibFunc(
         "trainRandomForestModel",
         data,
         algo,
         numClasses,
         categoricalFeaturesInfo,
         numTrees,
         featureSubsetStrategy,
         impurity,
         maxDepth,
         maxBins,
         seed,
     )
     return RandomForestModel(model)
Example #20
    def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the log normal distribution.

        :param sc: SparkContext used to create the RDD.
        :param mean: Mean of the log normal distribution
        :param std: Standard Deviation of the log normal distribution
        :param numRows: Number of Vectors in the RDD.
        :param numCols: Number of elements in each Vector.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`.

        >>> import numpy as np
        >>> from math import sqrt, exp
        >>> mean = 0.0
        >>> std = 1.0
        >>> expMean = exp(mean + 0.5 * std * std)
        >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std))
        >>> m = RandomRDDs.logNormalVectorRDD(sc, mean, std, 100, 100, seed=1).collect()
        >>> mat = np.matrix(m)
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - expMean) < 0.1
        True
        >>> abs(mat.std() - expStd) < 0.1
        True
        """
        return callMLlibFunc("logNormalVectorRDD", sc._jsc, float(mean), float(std),
                             numRows, numCols, numPartitions, seed)
Example #21
 def _train(
     cls,
     data,
     type,
     numClasses,
     features,
     impurity="gini",
     maxDepth=5,
     maxBins=32,
     minInstancesPerNode=1,
     minInfoGain=0.0,
 ):
     first = data.first()
     assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
     model = callMLlibFunc(
         "trainDecisionTreeModel",
         data,
         type,
         numClasses,
         features,
         impurity,
         maxDepth,
         maxBins,
         minInstancesPerNode,
         minInfoGain,
     )
     return DecisionTreeModel(model)
Example #22
    def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Exponential distribution with the input mean.

        :param sc: SparkContext used to create the RDD.
        :param mean: Mean, or 1 / lambda, for the Exponential distribution.
        :param numRows: Number of Vectors in the RDD.
        :param numCols: Number of elements in each Vector.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`)
        :param seed: Random seed (default: a random long integer).
        :return: RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean).

        >>> import numpy as np
        >>> mean = 0.5
        >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        return callMLlibFunc("exponentialVectorRDD", sc._jsc, float(mean), numRows, numCols,
                             numPartitions, seed)
Example #23
    def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Gamma distribution.

        :param sc: SparkContext used to create the RDD.
        :param shape: Shape (> 0) of the Gamma distribution
        :param scale: Scale (> 0) of the Gamma distribution
        :param numRows: Number of Vectors in the RDD.
        :param numCols: Number of elements in each Vector.
        :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
        :param seed: Random seed (default: a random long integer).
        :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale).

        >>> import numpy as np
        >>> from math import sqrt
        >>> shape = 1.0
        >>> scale = 2.0
        >>> expMean = shape * scale
        >>> expStd = sqrt(shape * scale * scale)
        >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - expMean) < 0.1
        True
        >>> abs(mat.std() - expStd) < 0.1
        True
        """
        return callMLlibFunc("gammaVectorRDD", sc._jsc, float(shape), float(scale),
                             numRows, numCols, numPartitions, seed)
Example #24
File: tree.py Project: 0xqq/spark
 def _train(cls, data, algo, categoricalFeaturesInfo,
            loss, numIterations, learningRate, maxDepth, maxBins):
     first = data.first()
     assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
     model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
                           loss, numIterations, learningRate, maxDepth, maxBins)
     return GradientBoostedTreesModel(model)
Example #25
 def trainImplicit(
     cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None
 ):
     model = callMLlibFunc(
         "trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed
     )
     return MatrixFactorizationModel(model)
Example #26
    def fit(self, dataset):
        """
        Computes the inverse document frequency.

        :param dataset: an RDD of term frequency vectors
        """
        jmodel = callMLlibFunc("fitIDF", self.minDocFreq, dataset)
        return IDFModel(jmodel)
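
A hedged end-to-end sketch pairing this with HashingTF (assuming `sc`; the corpus is made up):

from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([["spark", "mllib", "idf"], ["spark", "example"]])
tf = HashingTF(numFeatures=16).transform(docs)   # term-frequency vectors
tf.cache()
idf_model = IDF(minDocFreq=1).fit(tf)
tfidf = idf_model.transform(tf)
print(tfidf.collect())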
Example #27
 def computeCost(self, rdd):
     """
     Return the K-means cost (sum of squared distances of points to
     their nearest center) for this model on the given data.
     """
     cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector),
                          [_convert_to_vector(c) for c in self.centers])
     return cost
Example #28
 def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
     """Train a k-means clustering model."""
     # cache serialized data to avoid objects over head in JVM
     jcached = _to_java_object_rdd(rdd.map(_convert_to_vector), cache=True)
     model = callMLlibFunc("trainKMeansModel", jcached, k, maxIterations, runs,
                           initializationMode)
     centers = callJavaFunc(rdd.context, model.clusterCenters)
     return KMeansModel([c.toArray() for c in centers])
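
A minimal call for reference (illustrative points, `sc` assumed):

from pyspark.mllib.clustering import KMeans

data = sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])
model = KMeans.train(data, k=2, maxIterations=10, initializationMode="random")
print(model.clusterCenters)
print(model.predict([0.5, 0.5]))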
Example #29
File: util.py Project: AsafZ/spark
 def generateLinearRDD(sc, nexamples, nfeatures, eps,
                       nParts=2, intercept=0.0):
     """
      Generate an RDD of LabeledPoints.
     """
     return callMLlibFunc(
         "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures),
         float(eps), int(nParts), float(intercept))
Example #30
    def kolmogorovSmirnovTest(data, distName="norm", *params):
        """
        .. note:: Experimental

        Performs the Kolmogorov-Smirnov (KS) test for data sampled from
        a continuous distribution. It tests the null hypothesis that
        the data is generated from a particular distribution.

        The given data is sorted and the Empirical Cumulative
        Distribution Function (ECDF) is calculated; the ECDF at a given
        point is the number of points having a CDF value less than it,
        divided by the total number of points.

        Since the data is sorted, this is a step function
        that rises by (1 / length of data) for every ordered point.

        The KS statistic gives us the maximum distance between the
        ECDF and the CDF. Intuitively, if this statistic is large, the
        probability that the null hypothesis is true becomes small.
        For specific details of the implementation, please have a look
        at the Scala documentation.

        :param data: RDD, samples from the data
        :param distName: string, currently only "norm" (the normal
                         distribution) is supported; used to calculate
                         the theoretical distribution of the data.
        :param params: additional values which need to be provided for
                       a certain distribution.
                       If not provided, the default values are used.
        :return: KolmogorovSmirnovTestResult object containing the test
                 statistic, degrees of freedom, p-value,
                 the method used, and the null hypothesis.

        >>> kstest = Statistics.kolmogorovSmirnovTest
        >>> data = sc.parallelize([-1.0, 0.0, 1.0])
        >>> ksmodel = kstest(data, "norm")
        >>> print(round(ksmodel.pValue, 3))
        1.0
        >>> print(round(ksmodel.statistic, 3))
        0.175
        >>> ksmodel.nullHypothesis
        u'Sample follows theoretical distribution'

        >>> data = sc.parallelize([2.0, 3.0, 4.0])
        >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
        >>> print(round(ksmodel.pValue, 3))
        1.0
        >>> print(round(ksmodel.statistic, 3))
        0.175
        """
        if not isinstance(data, RDD):
            raise TypeError("data should be an RDD, got %s." % type(data))
        if not isinstance(distName, basestring):
            raise TypeError("distName should be a string, got %s." % type(distName))

        params = [float(param) for param in params]
        return KolmogorovSmirnovTestResult(
            callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
Example #31
    def __init__(self, rows, numRows=0, numCols=0):
        """
        Note: This docstring is not shown publicly.

        Create a wrapper over a Java IndexedRowMatrix.

        Publicly, we require that `rows` be an RDD.  However, for
        internal usage, `rows` can also be a Java IndexedRowMatrix
        object, in which case we can wrap it directly.  This
        assists in clean matrix conversions.

        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
        ...                        IndexedRow(1, [4, 5, 6])])
        >>> mat = IndexedRowMatrix(rows)

        >>> mat_diff = IndexedRowMatrix(rows)
        >>> (mat_diff._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        False

        >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model)
        >>> (mat_same._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        True
        """
        if isinstance(rows, RDD):
            rows = rows.map(_convert_to_indexed_row)
            # We use DataFrames for serialization of IndexedRows from
            # Python, so first convert the RDD to a DataFrame on this
            # side. This will convert each IndexedRow to a Row
            # containing the 'index' and 'vector' values, which can
            # both be easily serialized.  We will convert back to
            # IndexedRows on the Scala side.
            java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(),
                                        long(numRows), int(numCols))
        elif (isinstance(rows, JavaObject)
              and rows.getClass().getSimpleName() == "IndexedRowMatrix"):
            java_matrix = rows
        else:
            raise TypeError(
                "rows should be an RDD of IndexedRows or (long, vector) tuples, "
                "got %s" % type(rows))

        self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
Example #32
    def uniformRDD(sc: SparkContext,
                   size: int,
                   numPartitions: Optional[int] = None,
                   seed: Optional[int] = None) -> RDD[float]:
        """
        Generates an RDD comprised of i.i.d. samples from the
        uniform distribution U(0.0, 1.0).

        To transform the distribution in the generated RDD from U(0.0, 1.0)
        to U(a, b), use
        ``RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)``

        .. versionadded:: 1.1.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
            used to create the RDD.
        size : int
            Size of the RDD.
        numPartitions : int, optional
            Number of partitions in the RDD (default: `sc.defaultParallelism`).
        seed : int, optional
            Random seed (default: a random long integer).

        Returns
        -------
        :py:class:`pyspark.RDD`
            RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.

        Examples
        --------
        >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed)
Example #33
    def normalVectorRDD(
        sc: SparkContext,
        numRows: int,
        numCols: int,
        numPartitions: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> RDD[Vector]:
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the standard normal distribution.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
            SparkContext used to create the RDD.
        numRows : int
            Number of Vectors in the RDD.
        numCols : int
            Number of elements in each Vector.
        numPartitions : int, optional
            Number of partitions in the RDD (default: `sc.defaultParallelism`).
        seed : int, optional
            Random seed (default: a random long integer).

        Returns
        -------
        :py:class:`pyspark.RDD`
            RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`.

        Examples
        --------
        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols,
                             numPartitions, seed)
Example #34
    def uniformVectorRDD(
        sc: SparkContext,
        numRows: int,
        numCols: int,
        numPartitions: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> RDD[Vector]:
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the uniform distribution U(0.0, 1.0).

        .. versionadded:: 1.1.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
            SparkContext used to create the RDD.
        numRows : int
            Number of Vectors in the RDD.
        numCols : int
            Number of elements in each Vector.
        numPartitions : int, optional
            Number of partitions in the RDD.
        seed : int, optional
            Seed for the RNG that generates the seed for the generator in each partition.

        Returns
        -------
        :py:class:`pyspark.RDD`
            RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`.

        Examples
        --------
        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        return callMLlibFunc("uniformVectorRDD", sc._jsc, numRows, numCols,
                             numPartitions, seed)
Example #35
    def predictSoft(self, x):
        """
        Find the membership of point 'x' or each point in RDD 'x' to all mixture components.

        :param x:
          A feature vector or an RDD of vectors representing data points.
        :return:
          The membership value to all mixture components for vector 'x'
          or each vector in RDD 'x'.
        """
        if isinstance(x, RDD):
            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
            membership_matrix = callMLlibFunc("predictSoftGMM",
                                              x.map(_convert_to_vector),
                                              _convert_to_vector(self.weights),
                                              means, sigmas)
            return membership_matrix.map(lambda x: pyarray.array('d', x))
        else:
            return self.call("predictSoft", _convert_to_vector(x)).toArray()
Example #36
    def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Poisson distribution with the input mean.

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        return callMLlibFunc("poissonVectorRDD", sc._jsc, mean, numRows, numCols,
                             numPartitions, seed)
Example #37
    def load(cls, sc, path):
        """Load the LDAModel from disk.

        .. versionadded:: 1.5.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
        path : str
            Path to where the model is stored.
        """
        if not isinstance(sc, SparkContext):
            raise TypeError("sc should be a SparkContext, got type %s" %
                            type(sc))
        if not isinstance(path, str):
            raise TypeError("path should be a string, got type %s" %
                            type(path))
        model = callMLlibFunc("loadLDAModel", sc, path)
        return LDAModel(model)
Example #38
    def computeCost(self, rdd):
        """
        Return the K-means cost (sum of squared distances of points to
        their nearest center) for this model on the given
        data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            The RDD of points to compute the cost on.
        """
        cost = callMLlibFunc(
            "computeCostKmeansModel",
            rdd.map(_convert_to_vector),
            [_convert_to_vector(c) for c in self.centers],
        )
        return cost
Example #39
    def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Exponential distribution with the input mean.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
            SparkContext used to create the RDD.
        mean : float
            Mean, or 1 / lambda, for the Exponential distribution.
        numRows : int
            Number of Vectors in the RDD.
        numCols : int
            Number of elements in each Vector.
        numPartitions : int, optional
            Number of partitions in the RDD (default: `sc.defaultParallelism`)
        seed : int, optional
            Random seed (default: a random long integer).

        Returns
        -------
        :py:class:`pyspark.RDD`
            RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean).

        Examples
        --------
        >>> import numpy as np
        >>> mean = 0.5
        >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        return callMLlibFunc("exponentialVectorRDD", sc._jsc, float(mean), numRows, numCols,
                             numPartitions, seed)
Example #40
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features) vectors.

        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
        handle all kinds of discrete data.  For example, by converting
        documents into TF-IDF vectors, it can be used for document
        classification.  By making every vector a 0-1 vector, it can also be
        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        :param data: RDD of LabeledPoint.
        :param lambda_: The smoothing parameter
        """
        first = data.first()
        if not isinstance(first, LabeledPoint):
            raise ValueError("`data` should be an RDD of LabeledPoint")
        labels, pi, theta = callMLlibFunc("trainNaiveBayes", data, lambda_)
        return NaiveBayesModel(labels.toArray(), pi.toArray(),
                               numpy.array(theta))
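
A small worked call (labels and features are made up; `sc` assumed):

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(0.0, [2.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
    LabeledPoint(1.0, [0.0, 2.0]),
])
model = NaiveBayes.train(data, lambda_=1.0)
print(model.predict([0.0, 1.5]))   # should favor label 1.0 on this toy data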
Example #41
    def __init__(self, entries, numRows=0, numCols=0):
        """
        Note: This docstring is not shown publicly.

        Create a wrapper over a Java CoordinateMatrix.

        Publicly, we require that `rows` be an RDD.  However, for
        internal usage, `rows` can also be a Java CoordinateMatrix
        object, in which case we can wrap it directly.  This
        assists in clean matrix conversions.

        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                           MatrixEntry(6, 4, 2.1)])
        >>> mat = CoordinateMatrix(entries)

        >>> mat_diff = CoordinateMatrix(entries)
        >>> (mat_diff._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        False

        >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model)
        >>> (mat_same._java_matrix_wrapper._java_model ==
        ...  mat._java_matrix_wrapper._java_model)
        True
        """
        if isinstance(entries, RDD):
            entries = entries.map(_convert_to_matrix_entry)
            # We use DataFrames for serialization of MatrixEntry entries
            # from Python, so first convert the RDD to a DataFrame on
            # this side. This will convert each MatrixEntry to a Row
            # containing the 'i', 'j', and 'value' values, which can
            # each be easily serialized. We will convert back to
            # MatrixEntry inputs on the Scala side.
            java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(),
                                        long(numRows), long(numCols))
        elif (isinstance(entries, JavaObject)
              and entries.getClass().getSimpleName() == "CoordinateMatrix"):
            java_matrix = entries
        else:
            raise TypeError("entries should be an RDD of MatrixEntry entries or "
                            "(long, long, float) tuples, got %s" % type(entries))

        self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
Example #42
 def train(cls, rdd, k, maxIterations=100, initMode="random"):
     """
     :param rdd: an RDD of (i, j, s,,ij,,) tuples representing the
         affinity matrix, which is the matrix A in the PIC paper.
         The similarity s,,ij,, must be nonnegative.
         This is a symmetric matrix and hence s,,ij,, = s,,ji,,.
         For any (i, j) with nonzero similarity, there should be
         either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input.
         Tuples with i = j are ignored, because we assume
         s,,ij,, = 0.0.
     :param k: Number of clusters.
     :param maxIterations: Maximum number of iterations of the
         PIC algorithm.
     :param initMode: Initialization mode.
     """
     model = callMLlibFunc("trainPowerIterationClusteringModel",
                           rdd.map(_convert_to_vector), int(k),
                           int(maxIterations), initMode)
     return PowerIterationClusteringModel(model)
Example #43
    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None):
        """
        Train a Gaussian Mixture clustering model.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of independent Gaussians in the mixture model.
        convergenceTol : float, optional
            Maximum change in log-likelihood at which convergence is
            considered to have occurred.
            (default: 1e-3)
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        seed : int, optional
            Random seed for initial Gaussian distribution. Set as None to
            generate seed based on system time.
            (default: None)
        initialModel : GaussianMixtureModel, optional
            Initial GMM starting point, bypassing the random
            initialization.
            (default: None)
        """
        initialModelWeights = None
        initialModelMu = None
        initialModelSigma = None
        if initialModel is not None:
            if initialModel.k != k:
                raise Exception("Mismatched cluster count, initialModel.k = %s, however k = %s"
                                % (initialModel.k, k))
            initialModelWeights = list(initialModel.weights)
            initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]
            initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]
        java_model = callMLlibFunc("trainGaussianMixtureModel", rdd.map(_convert_to_vector),
                                   k, convergenceTol, maxIterations, seed,
                                   initialModelWeights, initialModelMu, initialModelSigma)
        return GaussianMixtureModel(java_model)
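
A short illustrative call (assuming `sc`; 1-D points chosen to form two clear components):

from pyspark.mllib.clustering import GaussianMixture

data = sc.parallelize([[-10.0], [-9.5], [9.5], [10.0]])
model = GaussianMixture.train(data, k=2, convergenceTol=1e-3,
                              maxIterations=50, seed=10)
print(model.weights)          # mixing weights of the two components
print(model.predict([-9.8]))  # index of the most likely component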
Example #44
    def train(cls, data, isotonic=True):
        """
        Train an isotonic regression model on the given data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            RDD of (label, feature, weight) tuples.
        isotonic : bool, optional
            Whether this is isotonic (which is default) or antitonic.
            (default: True)
        """
        boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
                                                data.map(_convert_to_vector),
                                                bool(isotonic))
        return IsotonicRegressionModel(boundaries.toArray(),
                                       predictions.toArray(), isotonic)
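
An illustrative call with (label, feature, weight) tuples (values made up; `sc` assumed):

from pyspark.mllib.regression import IsotonicRegression

data = sc.parallelize([(1.0, 1.0, 1.0), (2.0, 2.0, 1.0),
                       (2.5, 3.0, 1.0), (4.0, 4.0, 1.0)])
model = IsotonicRegression.train(data, isotonic=True)
print(model.predict(2.5))   # interpolates between the fitted boundaries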
Example #45
    def train(
        self,
        rdd: RDD["VectorLike"],
        k: int = 4,
        maxIterations: int = 20,
        minDivisibleClusterSize: float = 1.0,
        seed: int = -1888008604,
    ) -> BisectingKMeansModel:
        """
        Runs the bisecting k-means algorithm and returns the model.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of `Vector` or convertible
            sequence types.
        k : int, optional
            The desired number of leaf clusters. The actual number could
            be smaller if there are no divisible leaf clusters.
            (default: 4)
        maxIterations : int, optional
            Maximum number of iterations allowed to split clusters.
            (default: 20)
        minDivisibleClusterSize : float, optional
            Minimum number of points (if >= 1.0) or the minimum proportion
            of points (if < 1.0) of a divisible cluster.
            (default: 1)
        seed : int, optional
            Random seed value for cluster initialization.
            (default: -1888008604 from classOf[BisectingKMeans].getName.##)
        """
        java_model = callMLlibFunc(
            "trainBisectingKMeans",
            rdd.map(_convert_to_vector),
            k,
            maxIterations,
            minDivisibleClusterSize,
            seed,
        )
        return BisectingKMeansModel(java_model)
Example #46
    def normalRDD(sc: SparkContext,
                  size: int,
                  numPartitions: Optional[int] = None,
                  seed: Optional[int] = None) -> RDD[float]:
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma^2), use
        ``RandomRDDs.normal(sc, n, p, seed).map(lambda v: mean + sigma * v)``

        .. versionadded:: 1.1.0

        Parameters
        ----------
        sc : :py:class:`pyspark.SparkContext`
            used to create the RDD.
        size : int
            Size of the RDD.
        numPartitions : int, optional
            Number of partitions in the RDD (default: `sc.defaultParallelism`).
        seed : int, optional
            Random seed (default: a random long integer).

        Returns
        -------
        :py:class:`pyspark.RDD`
            RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).

        Examples
        --------
        >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
Example #47
    def generateLinearInput(intercept, weights, xMean, xVariance, nPoints,
                            seed, eps):
        """
        .. versionadded:: 1.5.0

        Parameters
        ----------
        intercept : float
            bias factor, the term c in X'w + c
        weights : :py:class:`pyspark.mllib.linalg.Vector` or convertible
            feature vector, the term w in X'w + c
        xMean : :py:class:`pyspark.mllib.linalg.Vector` or convertible
            Point around which the data X is centered.
        xVariance : :py:class:`pyspark.mllib.linalg.Vector` or convertible
            Variance of the given data
        nPoints : int
            Number of points to be generated
        seed : int
            Random Seed
        eps : float
            Used to scale the noise. If eps is set high,
            the amount of gaussian noise added is more.

        Returns
        -------
        list
            of :py:class:`pyspark.mllib.regression.LabeledPoint` of length nPoints
        """
        weights = [float(weight) for weight in weights]
        xMean = [float(mean) for mean in xMean]
        xVariance = [float(var) for var in xVariance]
        return list(
            callMLlibFunc(
                "generateLinearInputWrapper",
                float(intercept),
                weights,
                xMean,
                xVariance,
                int(nPoints),
                int(seed),
                float(eps),
            ))
Example #48
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma^2), use
        C{RandomRDDs.normal(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
Example #49
 def generateLinearRDD(
     sc: SparkContext,
     nexamples: int,
     nfeatures: int,
     eps: float,
     nParts: int = 2,
     intercept: float = 0.0,
 ) -> RDD["LabeledPoint"]:
     """
     Generate an RDD of LabeledPoints.
     """
     return callMLlibFunc(
         "generateLinearRDDWrapper",
         sc,
         int(nexamples),
         int(nfeatures),
         float(eps),
         int(nParts),
         float(intercept),
     )
Example #50
    def generateLinearInput(intercept, weights, xMean, xVariance,
                            nPoints, seed, eps):
        """
        :param intercept: bias factor, the term c in X'w + c
        :param weights: feature vector, the term w in X'w + c
        :param xMean: Point around which the data X is centered.
        :param xVariance: Variance of the given data
        :param nPoints: Number of points to be generated
        :param seed: Random Seed
        :param eps: Used to scale the noise. If eps is set high,
                    the amount of gaussian noise added is more.

        Returns a list of LabeledPoints of length nPoints
        """
        weights = [float(weight) for weight in weights]
        xMean = [float(mean) for mean in xMean]
        xVariance = [float(var) for var in xVariance]
        return list(callMLlibFunc(
            "generateLinearInputWrapper", float(intercept), weights, xMean,
            xVariance, int(nPoints), int(seed), float(eps)))
Example #51
    def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
              topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"):
        """Train an LDA model.

        :param rdd:                 RDD of data points
        :param k:                   Number of clusters you want
        :param maxIterations:       Number of iterations. Default to 20
        :param docConcentration:    Concentration parameter (commonly named "alpha")
            for the prior placed on documents' distributions over topics ("theta").
        :param topicConcentration:  Concentration parameter (commonly named "beta" or "eta")
            for the prior placed on topics' distributions over terms.
        :param seed:                Random Seed
        :param checkpointInterval:  Period (in iterations) between checkpoints.
        :param optimizer:           LDAOptimizer used to perform the actual calculation.
            Currently "em", "online" are supported. Default to "em".
        """
        model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations,
                              docConcentration, topicConcentration, seed,
                              checkpointInterval, optimizer)
        return LDAModel(model)
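
A hedged usage sketch (assuming `sc`); the corpus is an RDD of [document id, term-count vector] pairs with made-up counts:

from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

corpus = sc.parallelize([[1, Vectors.dense([1.0, 2.0, 0.0])],
                         [2, Vectors.dense([0.0, 3.0, 1.0])]])
model = LDA.train(corpus, k=2, maxIterations=20, seed=1)
print(model.vocabSize())     # 3 terms in this toy vocabulary
print(model.topicsMatrix())  # vocabSize x k matrix of term weights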
Example #52
 def trainImplicit(cls,
                   ratings,
                   rank,
                   iterations=5,
                   lambda_=0.01,
                   blocks=-1,
                   alpha=0.01,
                   nonnegative=False,
                   seed=None):
     """
     Train a matrix factorization model given an RDD of 'implicit preferences' given by users
     to some products, in the form of (userID, productID, preference) tuples. We approximate the
     ratings matrix as the product of two lower-rank matrices of a given rank (number of
     features). To solve for these features, we run a given number of iterations of ALS.
     This is done using a level of parallelism given by `blocks`.
     """
     model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings),
                           rank, iterations, lambda_, blocks, alpha,
                           nonnegative, seed)
     return MatrixFactorizationModel(model)
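A minimal sketch of calling trainImplicit, assuming a SparkContext sc; the (userID, productID, preference) triples below are toy data.

from pyspark.mllib.recommendation import ALS

prefs = sc.parallelize([(0, 0, 4.0), (0, 1, 1.0),
                        (1, 0, 2.0), (1, 1, 5.0)])
model = ALS.trainImplicit(prefs, rank=2, iterations=5, alpha=0.01, seed=42)
pred = model.predict(0, 1)   # predicted preference of user 0 for product 1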
Example #53
0
    def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):
        """
        Runs the bisecting k-means algorithm and returns the model.

        :param rdd: input RDD to be trained on
        :param k: The desired number of leaf clusters (default: 4).
            The actual number could be smaller if there are no divisible
            leaf clusters.
        :param maxIterations: the max number of k-means iterations to
            split clusters (default: 20)
        :param minDivisibleClusterSize: the minimum number of points
            (if >= 1.0) or the minimum proportion of points (if < 1.0)
            of a divisible cluster (default: 1)
        :param seed: a random seed (default: -1888008604 from
            classOf[BisectingKMeans].getName.##)
        """
        java_model = callMLlibFunc(
            "trainBisectingKMeans", rdd.map(_convert_to_vector),
            k, maxIterations, minDivisibleClusterSize, seed)
        return BisectingKMeansModel(java_model)
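A short usage sketch, assuming a SparkContext sc; the 1-D points are toy data chosen so that two obvious leaf clusters exist.

from pyspark.mllib.clustering import BisectingKMeans

data = sc.parallelize([[0.0], [1.0], [9.0], [10.0]])
model = BisectingKMeans.train(data, k=2, maxIterations=20)
cluster = model.predict([9.5])   # index of the cluster nearest to 9.5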
Example #54
0
    def entries(self):
        """
        Entries of the CoordinateMatrix stored as an RDD of
        MatrixEntries.

        >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2),
        ...                                        MatrixEntry(6, 4, 2.1)]))
        >>> entries = mat.entries
        >>> entries.first()
        MatrixEntry(0, 0, 1.2)
        """
        # We use DataFrames for serialization of MatrixEntry entries
        # from Java, so we first convert the RDD of entries to a
        # DataFrame on the Scala/Java side. Then we map each Row in
        # the DataFrame back to a MatrixEntry on this side.
        entries_df = callMLlibFunc("getMatrixEntries",
                                   self._java_matrix_wrapper._java_model)
        entries = entries_df.map(
            lambda row: MatrixEntry(row[0], row[1], row[2]))
        return entries
Example #55
0
    def train(
        cls,
        rdd: RDD[Tuple[int, int, float]],
        k: int,
        maxIterations: int = 100,
        initMode: str = "random",
    ) -> PowerIterationClusteringModel:
        r"""
        Train PowerIterationClusteringModel

        .. versionadded:: 1.5.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
            affinity matrix, which is the matrix A in the PIC paper.  The
            similarity s\ :sub:`ij`\ must be nonnegative.  This is a symmetric
            matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\  For any (i, j) with
            nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
            (j, i, s\ :sub:`ji`\) in the input.  Tuples with i = j are ignored,
            because it is assumed s\ :sub:`ij`\ = 0.0.
        k : int
            Number of clusters.
        maxIterations : int, optional
            Maximum number of iterations of the PIC algorithm.
            (default: 100)
        initMode : str, optional
            Initialization mode. This can be either "random" to use
            a random vector as vertex properties, or "degree" to use
            normalized sum similarities.
            (default: "random")
        """
        model = callMLlibFunc(
            "trainPowerIterationClusteringModel",
            rdd.map(_convert_to_vector),
            int(k),
            int(maxIterations),
            initMode,
        )
        return PowerIterationClusteringModel(model)
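A hedged sketch of building the affinity RDD and training, assuming a SparkContext sc; the similarity values form a toy graph with one tight triangle and one weakly linked pair.

from pyspark.mllib.clustering import PowerIterationClustering

similarities = sc.parallelize([
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),   # tight triangle
    (3, 4, 1.0),                             # separate pair
    (2, 3, 0.1)])                            # weak link between the groups
model = PowerIterationClustering.train(similarities, k=2, maxIterations=20)
assignments = model.assignments().collect()  # Assignment(id, cluster) records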
Example #56
0
    def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
              topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"):
        """Train a LDA model.

        :param rdd:
          RDD of documents, which are tuples of document IDs and term
          (word) count vectors. The term count vectors are "bags of
          words" with a fixed-size vocabulary (where the vocabulary size
          is the length of the vector). Document IDs must be unique
          and >= 0.
        :param k:
          Number of topics to infer, i.e., the number of soft cluster
          centers.
          (default: 10)
        :param maxIterations:
          Maximum number of iterations allowed.
          (default: 20)
        :param docConcentration:
          Concentration parameter (commonly named "alpha") for the prior
          placed on documents' distributions over topics ("theta").
          (default: -1.0)
        :param topicConcentration:
          Concentration parameter (commonly named "beta" or "eta") for
          the prior placed on topics' distributions over terms.
          (default: -1.0)
        :param seed:
          Random seed for cluster initialization. Set as None to generate
          seed based on system time.
          (default: None)
        :param checkpointInterval:
          Period (in iterations) between checkpoints.
          (default: 10)
        :param optimizer:
          LDAOptimizer used to perform the actual calculation. Currently
          "em", "online" are supported.
          (default: "em")
        """
        model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations,
                              docConcentration, topicConcentration, seed,
                              checkpointInterval, optimizer)
        return LDAModel(model)
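A minimal corpus-preparation and training sketch, assuming a SparkContext sc; the two documents over a 3-term vocabulary are illustrative.

from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

corpus = sc.parallelize([
    [0, Vectors.dense([1.0, 2.0, 0.0])],   # (docID, term-count vector)
    [1, Vectors.dense([0.0, 3.0, 1.0])]])
model = LDA.train(corpus, k=2, maxIterations=20, seed=1)
topics = model.topicsMatrix()   # vocabSize x k matrix of term weights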
Example #57
0
    def train(cls,
              ratings,
              rank,
              iterations=5,
              lambda_=0.01,
              blocks=-1,
              nonnegative=False,
              seed=None):
        """
        Train a matrix factorization model given an RDD of ratings by users
        for a subset of products. The ratings matrix is approximated as the
        product of two lower-rank matrices of a given rank (number of
        features). To solve for these features, ALS is run iteratively with
        a configurable level of parallelism.

        :param ratings:
          RDD of `Rating` or (userID, productID, rating) tuple.
        :param rank:
          Number of features to use (also referred to as the number of latent factors).
        :param iterations:
          Number of iterations of ALS.
          (default: 5)
        :param lambda_:
          Regularization parameter.
          (default: 0.01)
        :param blocks:
          Number of blocks used to parallelize the computation. A value
          of -1 will use an auto-configured number of blocks.
          (default: -1)
        :param nonnegative:
          A value of True will solve least-squares with nonnegativity
          constraints.
          (default: False)
        :param seed:
          Random seed for initial matrix factorization model. A value
          of None will use system time as the seed.
          (default: None)
        """
        model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank,
                              iterations, lambda_, blocks, nonnegative, seed)
        return MatrixFactorizationModel(model)
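A short usage sketch for explicit-feedback ALS, assuming a SparkContext sc; the Rating records are toy data.

from pyspark.mllib.recommendation import ALS, Rating

ratings = sc.parallelize([Rating(0, 0, 4.0), Rating(0, 1, 2.0),
                          Rating(1, 0, 3.0), Rating(1, 1, 5.0)])
model = ALS.train(ratings, rank=2, iterations=10, lambda_=0.01, seed=42)
pred = model.predict(1, 0)   # predicted rating of product 0 by user 1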
Example #58
0
    def train(cls,
              data,
              minSupport=0.1,
              maxPatternLength=10,
              maxLocalProjDBSize=32000000):
        """
        Finds the complete set of frequent sequential patterns in the input sequences of itemsets.

        :param data: The input data set; each element contains a sequence of itemsets.
        :param minSupport: The minimal support level of the sequential pattern; any pattern that
            appears more than (minSupport * size-of-the-dataset) times will be output (default: `0.1`)
        :param maxPatternLength: The maximal length of the sequential pattern; only patterns whose
            length does not exceed maxPatternLength will be output. (default: `10`)
        :param maxLocalProjDBSize: The maximum number of items (including delimiters used in
            the internal storage format) allowed in a projected database before local
            processing. If a projected database exceeds this size, another
            iteration of distributed prefix growth is run. (default: `32000000`)
        """
        model = callMLlibFunc("trainPrefixSpanModel", data, minSupport,
                              maxPatternLength, maxLocalProjDBSize)
        return PrefixSpanModel(model)
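A minimal sketch of feeding sequences of itemsets to PrefixSpan, assuming a SparkContext sc; the sequences are toy data.

from pyspark.mllib.fpm import PrefixSpan

# Each sequence is a list of itemsets; each itemset is a list of items.
sequences = sc.parallelize([
    [["a", "b"], ["c"]],
    [["a"], ["c", "b"], ["a", "b"]],
    [["a", "b"], ["e"]],
    [["f"]]], 2)
model = PrefixSpan.train(sequences, minSupport=0.5, maxPatternLength=5)
freq = model.freqSequences().collect()   # FreqSequence(sequence, freq) records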
Example #59
0
    def fit(self, dataset):
        """
        Computes the mean and variance and stores as a model to be used
        for later scaling.

        .. versionadded:: 1.2.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.RDD`
            The data used to compute the mean and variance
            to build the transformation model.

        Returns
        -------
        :py:class:`StandardScalerModel`
        """
        dataset = dataset.map(_convert_to_vector)
        jmodel = callMLlibFunc("fitStandardScaler", self.withMean,
                               self.withStd, dataset)
        return StandardScalerModel(jmodel)
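A short fit-then-transform sketch, assuming a SparkContext sc; the three dense vectors are illustrative.

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

data = sc.parallelize([Vectors.dense([1.0, 2.0]),
                       Vectors.dense([3.0, 4.0]),
                       Vectors.dense([5.0, 6.0])])
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(data)                    # computes column means and variances
scaled = model.transform(data).collect()    # centred, unit-variance dense vectors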
Example #60
0
    def blocks(self):
        """
        The RDD of sub-matrix blocks
        ((blockRowIndex, blockColIndex), sub-matrix) that form this
        distributed matrix.

        >>> mat = BlockMatrix(
        ...     sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
        ...                     ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2)
        >>> blocks = mat.blocks
        >>> blocks.first()
        ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))

        """
        # We use DataFrames for serialization of sub-matrix blocks
        # from Java, so we first convert the RDD of blocks to a
        # DataFrame on the Scala/Java side. Then we map each Row in
        # the DataFrame back to a sub-matrix block on this side.
        blocks_df = callMLlibFunc("getMatrixBlocks", self._java_matrix_wrapper._java_model)
        blocks = blocks_df.rdd.map(lambda row: ((row[0][0], row[0][1]), row[1]))
        return blocks