Example #1
 def appendBias(data):
     """
     Returns a new vector with `1.0` (bias) appended to
     the end of the input vector.
     """
     vec = _convert_to_vector(data)
     if isinstance(vec, SparseVector):
         newIndices = np.append(vec.indices, len(vec))
         newValues = np.append(vec.values, 1.0)
         return SparseVector(len(vec) + 1, newIndices, newValues)
     else:
         return _convert_to_vector(np.append(vec.toArray(), 1.0))
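
A minimal usage sketch (hypothetical values; assumes this helper is exposed as MLUtils.appendBias in pyspark.mllib.util):

    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.util import MLUtils

    MLUtils.appendBias([1.0, 2.0])
    # -> DenseVector([1.0, 2.0, 1.0])
    MLUtils.appendBias(SparseVector(3, {0: 4.0}))
    # -> SparseVector(4, {0: 4.0, 3: 1.0}); the bias occupies the new last slot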
Example #2
def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    from pyspark.mllib.classification import LogisticRegressionModel
    first = data.first()
    if not isinstance(first, LabeledPoint):
        raise TypeError("data should be an RDD of LabeledPoint, but got %s" % type(first))
    if initial_weights is None:
        initial_weights = [0.0] * len(data.first().features)
    if (modelClass == LogisticRegressionModel):
        weights, intercept, numFeatures, numClasses = train_func(
            data, _convert_to_vector(initial_weights))
        return modelClass(weights, intercept, numFeatures, numClasses)
    else:
        weights, intercept = train_func(data, _convert_to_vector(initial_weights))
        return modelClass(weights, intercept)
Example #3
    def predictSoft(self, x):
        """
        Find the membership of point 'x' or each point in RDD 'x' to all mixture components.

        :param x:    a vector or an RDD of vectors representing data points.
        :return:     the membership value to all mixture components for vector 'x'
                     or each vector in RDD 'x'.
        """
        if isinstance(x, RDD):
            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
            membership_matrix = callMLlibFunc(
                "predictSoftGMM", x.map(_convert_to_vector), _convert_to_vector(self.weights), means, sigmas
            )
            return membership_matrix.map(lambda x: pyarray.array("d", x))
        else:
            return self.call("predictSoft", _convert_to_vector(x)).toArray()
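
A hedged end-to-end sketch of calling predictSoft on a fitted model (hypothetical data; assumes a local SparkContext and GaussianMixture.train from pyspark.mllib.clustering):

    from pyspark import SparkContext
    from pyspark.mllib.clustering import GaussianMixture

    sc = SparkContext("local", "gmm-demo")  # hypothetical local context
    data = sc.parallelize([[1.0], [1.2], [9.0], [9.2]])
    model = GaussianMixture.train(data, k=2, seed=42)
    # one array('d', ...) of per-component membership weights per input point
    memberships = model.predictSoft(data).collect()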
Example #4
 def save(self, sc, path):
     """
     Save this model to the given path.
     """
     java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers])
     java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
     java_model.save(sc._jsc.sc(), path)
Example #5
 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
     initialModel=None,
 ):
     """Train a k-means clustering model."""
     clusterInitialModel = []
     if initialModel is not None:
         if not isinstance(initialModel, KMeansModel):
             raise Exception(
                 "initialModel is of " + str(type(initialModel)) + ". It needs " "to be of <type 'KMeansModel'>"
             )
         clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
     model = callMLlibFunc(
         "trainKMeansModel",
         rdd.map(_convert_to_vector),
         k,
         maxIterations,
         runs,
         initializationMode,
         seed,
         initializationSteps,
         epsilon,
         clusterInitialModel,
     )
     centers = callJavaFunc(rdd.context, model.clusterCenters)
     return KMeansModel([c.toArray() for c in centers])
Example #6
    def update(self, data, decayFactor, timeUnit):
        """Update the centroids, according to data

        :param data:
          RDD with new data for the model update.
        :param decayFactor:
          Forgetfulness of the previous centroids.
        :param timeUnit:
          Can be "batches" or "points". If "points", the decay factor is
          raised to the power of the number of new points; if "batches",
          the decay factor is used as is.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be an RDD, got %s." % type(data))
        data = data.map(_convert_to_vector)
        decayFactor = float(decayFactor)
        if timeUnit not in ["batches", "points"]:
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        vectorCenters = [_convert_to_vector(center) for center in self.centers]
        updatedModel = callMLlibFunc(
            "updateStreamingKMeansModel", vectorCenters, self._clusterWeights,
            data, decayFactor, timeUnit)
        self.centers = array(updatedModel[0])
        self._clusterWeights = list(updatedModel[1])
        return self
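
Per the Spark streaming k-means documentation, the discount applied on the JVM side is decayFactor when timeUnit is "batches" and decayFactor ** numNewPoints when it is "points". A standalone numpy sketch of the resulting single-cluster update rule (an assumption about the exact arithmetic, not the JVM source):

    import numpy as np

    def discounted_center_update(center, weight, new_points, decay_factor, time_unit):
        # discount rule as described in the Spark docs (assumed, not verified
        # against the JVM implementation)
        a = decay_factor if time_unit == "batches" else decay_factor ** len(new_points)
        n = len(new_points)
        new_weight = weight * a + n
        # weighted average of the discounted old center and the new points
        new_center = (center * weight * a + np.sum(new_points, axis=0)) / new_weight
        return new_center, new_weight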
Example #7
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
Example #8
 def predict(self, x):
     """
     Predict the value of the dependent variable given a vector x
     containing values for the independent variables.
     """
     x = _convert_to_vector(x)
     return self.weights.dot(x) + self.intercept
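
A worked example with made-up coefficients: for weights [0.5, -0.2], intercept 0.1, and x = [2.0, 1.0], the prediction is 0.5*2.0 - 0.2*1.0 + 0.1 = 0.9:

    from pyspark.mllib.linalg import DenseVector

    weights = DenseVector([0.5, -0.2])  # hypothetical coefficients
    intercept = 0.1
    weights.dot([2.0, 1.0]) + intercept  # -> 0.9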
Example #9
def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    first = data.first()
    if not isinstance(first, LabeledPoint):
        raise ValueError("data should be an RDD of LabeledPoint, but got %s" % first)
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    weights, intercept = train_func(data, _convert_to_vector(initial_weights))
    return modelClass(weights, intercept)
Example #10
 def computeCost(self, rdd):
     """
     Return the K-means cost (sum of squared distances of points to
     their nearest center) for this model on the given data.
     """
     cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector),
                          [_convert_to_vector(c) for c in self.centers])
     return cost
Example #11
    def test_serialize(self):
        from scipy.sparse import lil_matrix
        lil = lil_matrix((4, 1))
        lil[1, 0] = 1
        lil[3, 0] = 2
        sv = SparseVector(4, {1: 1, 3: 2})
        self.assertEqual(sv, _convert_to_vector(lil))
        self.assertEqual(sv, _convert_to_vector(lil.tocsc()))
        self.assertEqual(sv, _convert_to_vector(lil.tocoo()))
        self.assertEqual(sv, _convert_to_vector(lil.tocsr()))
        self.assertEqual(sv, _convert_to_vector(lil.todok()))

        def serialize(l):
            return ser.loads(ser.dumps(_convert_to_vector(l)))
        self.assertEqual(sv, serialize(lil))
        self.assertEqual(sv, serialize(lil.tocsc()))
        self.assertEqual(sv, serialize(lil.tocsr()))
        self.assertEqual(sv, serialize(lil.todok()))
Example #12
 def predict(self, x):
     x = _convert_to_vector(x)
     margin = self.weights.dot(x) + self._intercept
     if margin > 0:
         prob = 1 / (1 + exp(-margin))
     else:
         exp_margin = exp(margin)
         prob = exp_margin / (1 + exp_margin)
     return 1 if prob > 0.5 else 0
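
The two-branch computation is the numerically stable form of the logistic function: both branches equal 1 / (1 + exp(-margin)), but the second rewrites it as exp(margin) / (1 + exp(margin)) so that exp() never receives a large positive argument and cannot overflow. A standalone sketch:

    import math

    def stable_sigmoid(margin):
        # equivalent to 1 / (1 + exp(-margin)), but safe for large-magnitude
        # negative margins, where the naive form would overflow in exp()
        if margin > 0:
            return 1.0 / (1.0 + math.exp(-margin))
        e = math.exp(margin)
        return e / (1.0 + e)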
Example #13
    def setInitialWeights(self, initialWeights):
        """
        Set the initial value of weights.

        This must be set before running trainOn and predictOn.
        """
        initialWeights = _convert_to_vector(initialWeights)
        self._model = LinearRegressionModel(initialWeights, 0)
        return self
Example #14
 def predict(self, x):
     """
     Predict the value of the dependent variable given a vector or
     an RDD of vectors containing values for the independent variables.
     """
     if isinstance(x, RDD):
         return x.map(self.predict)
     x = _convert_to_vector(x)
     return self.weights.dot(x) + self.intercept
Example #15
 def predict(self, x):
     """
     Return the most likely class for a data vector
     or an RDD of vectors
     """
     if isinstance(x, RDD):
         return x.map(lambda v: self.predict(v))
     x = _convert_to_vector(x)
     return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))]
Example #16
 def predict_all(self, x):
     if isinstance(x, RDD):
         return x.map(lambda v: self.predict_all(v))
     x = _convert_to_vector(x)
     log_probs = self.pi + x.dot(self.theta.transpose())
     scaled_log_probs = scale(log_probs)
     int_labels = [int(l_i) for l_i in self.labels]
     labels_and_log_probs = zip(int_labels, scaled_log_probs)
     return sorted(labels_and_log_probs, key=lambda x: x[1], reverse=True)
Example #17
 def test_convert_to_vector(self):
     from scipy.sparse import csc_matrix
     # Create a CSC matrix with non-sorted indices
     indptr = array([0, 2])
     indices = array([3, 1])
     data = array([2.0, 1.0])
     csc = csc_matrix((data, indices, indptr))
     self.assertFalse(csc.has_sorted_indices)
     sv = SparseVector(4, {1: 1, 3: 2})
     self.assertEqual(sv, _convert_to_vector(csc))
Example #18
    def transform(self, vector):
        """
        Computes the Hadamard product of the scaling vector and the input vector.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)

        else:
            vector = _convert_to_vector(vector)
        return callMLlibFunc("elementwiseProductVector", self.scalingVector, vector)
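
A worked example of the element-wise (Hadamard) product with hypothetical vectors; an active SparkContext is required, since the computation is delegated to the JVM via callMLlibFunc:

    from pyspark.mllib.feature import ElementwiseProduct
    from pyspark.mllib.linalg import Vectors

    ew = ElementwiseProduct(Vectors.dense([2.0, 0.0, 3.0]))
    ew.transform(Vectors.dense([1.0, 5.0, 2.0]))
    # -> DenseVector([2.0, 0.0, 6.0]); each coordinate is scaled independently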
Example #19
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points using
        the model trained.
        """
        if isinstance(x, RDD):
            return self.call("predict", x.map(_convert_to_vector))

        else:
            return self.call("predict", _convert_to_vector(x))
Example #20
 def findSynonyms(self, word, num):
     """
     Find "num" number of words closest in similarity to "word".
     word can be a string or vector representation.
     Returns a dataframe with two fields word and similarity (which
     gives the cosine similarity).
     """
     if not isinstance(word, str):
         word = _convert_to_vector(word)
     return self._call_java("findSynonyms", word, num)
Example #21
    def setInitialWeights(self, initialWeights):
        """
        Set the initial value of weights.

        This must be set before running trainOn and predictOn.
        """
        initialWeights = _convert_to_vector(initialWeights)

        # LogisticRegressionWithSGD does only binary classification.
        self._model = LogisticRegressionModel(initialWeights, 0, initialWeights.size, 2)
        return self
Example #22
 def predict(self, x):
     """Find the cluster to which x belongs in this model."""
     best = 0
     best_distance = float("inf")
     x = _convert_to_vector(x)
     for i in range(len(self.centers)):
         distance = x.squared_distance(self.centers[i])
         if distance < best_distance:
             best = i
             best_distance = distance
     return best
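
An equivalent vectorized form of the same nearest-center search, sketched with numpy and hypothetical centers:

    import numpy as np

    centers = np.array([[0.0, 0.0], [5.0, 5.0]])  # hypothetical centers
    x = np.array([4.5, 5.5])
    # squared Euclidean distance to every center, then the index of the smallest
    best = int(np.argmin(((centers - x) ** 2).sum(axis=1)))  # -> 1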
Example #23
    def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
              seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
        """
        Train a k-means clustering model.

        :param rdd:
          Training points as an `RDD` of `Vector` or convertible
          sequence types.
        :param k:
          Number of clusters to create.
        :param maxIterations:
          Maximum number of iterations allowed.
          (default: 100)
        :param runs:
          Number of runs to execute in parallel. The best model according
          to the cost function will be returned (deprecated in 1.6.0).
          (default: 1)
        :param initializationMode:
          The initialization algorithm. This can be either "random" or
          "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed value for cluster initialization. Set as None to
          generate seed based on system time.
          (default: None)
        :param initializationSteps:
          Number of steps for the k-means|| initialization mode.
          This is an advanced setting -- the default of 5 is almost
          always enough.
          (default: 5)
        :param epsilon:
          Distance threshold within which a center will be considered to
          have converged. If all centers move less than this Euclidean
          distance, iterations are stopped.
          (default: 1e-4)
        :param initialModel:
          Initial cluster centers can be provided as a KMeansModel object
          rather than using the random or k-means|| initialization mode.
          (default: None)
        """
        if runs != 1:
            warnings.warn(
                "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.")
        clusterInitialModel = []
        if initialModel is not None:
            if not isinstance(initialModel, KMeansModel):
                raise Exception("initialModel is of "+str(type(initialModel))+". It needs "
                                "to be of <type 'KMeansModel'>")
            clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
        model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations,
                              runs, initializationMode, seed, initializationSteps, epsilon,
                              clusterInitialModel)
        centers = callJavaFunc(rdd.context, model.clusterCenters)
        return KMeansModel([c.toArray() for c in centers])
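
A hedged end-to-end sketch of the call (hypothetical data; assumes a local SparkContext):

    from pyspark import SparkContext
    from pyspark.mllib.clustering import KMeans

    sc = SparkContext("local", "kmeans-demo")  # hypothetical local context
    points = sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.1]])
    model = KMeans.train(points, k=2, maxIterations=10, seed=1)
    model.predict([0.05, 0.05])  # index of the cluster around the origin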
Example #24
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
Example #25
    def predictSoft(self, x):
        """
        Find the membership of each point in 'x' to all mixture components.

        :param x:    RDD of data points.
        :return:     the membership matrix, an RDD of arrays of double values.
        """
        if isinstance(x, RDD):
            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
            membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
                                              _convert_to_vector(self.weights), means, sigmas)
            return membership_matrix.map(lambda x: pyarray.array('d', x))
Example #26
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        if isinstance(x, RDD):
            return self.call("predict", x.map(_convert_to_vector))

        else:
            return self.call("predict", _convert_to_vector(x))
Example #27
    def transform(self, vector):
        """
        Applies unit length normalization on a vector.

        :param vector: vector or RDD of vectors to be normalized.
        :return: normalized vector. If the norm of the input is zero, it
                 will return the input vector.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return callMLlibFunc("normalizeVector", self.p, vector)
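
A worked example using the 3-4-5 triangle (an active SparkContext is required, since normalizeVector runs in the JVM): the L2 norm of [3.0, 4.0] is 5.0, so normalization yields [0.6, 0.8]:

    from pyspark.mllib.feature import Normalizer
    from pyspark.mllib.linalg import Vectors

    Normalizer(p=2.0).transform(Vectors.dense([3.0, 4.0]))
    # -> DenseVector([0.6, 0.8])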
Example #28
 def _convert_labeled_point_to_libsvm(p):
     """Converts a LabeledPoint to a string in LIBSVM format."""
     assert isinstance(p, LabeledPoint)
     items = [str(p.label)]
     v = _convert_to_vector(p.features)
     if isinstance(v, SparseVector):
         nnz = len(v.indices)
         for i in range(nnz):
             items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
     else:
         for i in range(len(v)):
             items.append(str(i + 1) + ":" + str(v[i]))
     return " ".join(items)
Example #29
    def computeCost(self, x):
        """
        Return the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) for this model on the given
        data. If provided with an RDD of points returns the sum.

        :param x: the point or RDD of points for which to compute the cost(s).
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("computeCost", vecs)

        return self.call("computeCost", _convert_to_vector(x))
Example #30
    def findSynonyms(self, word, num):
        """
        Find synonyms of a word

        :param word: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Note: local use only
        """
        if not isinstance(word, str):
            word = _convert_to_vector(word)
        words, similarity = self.call("findSynonyms", word, num)
        return zip(words, similarity)
Example #31
    def predictSoft(self, x):
        """
        Find the membership of each point in 'x' to all mixture components.

        :param x:    RDD of data points.
        :return:     the membership matrix, an RDD of arrays of double values.
        """
        if isinstance(x, RDD):
            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
            membership_matrix = callMLlibFunc("predictSoftGMM",
                                              x.map(_convert_to_vector),
                                              _convert_to_vector(self.weights),
                                              means, sigmas)
            return membership_matrix.map(lambda x: pyarray.array('d', x))
        else:
            raise TypeError("x should be represented by an RDD, "
                            "but got %s." % type(x))
Example #32
    def predict(self, x):
        """
        Find the cluster that each of the points belongs to in this
        model.

        :param x:
          A data point (or RDD of points) to determine cluster index.
        :return:
          Predicted cluster index or an RDD of predicted cluster indices
          if the input is an RDD.
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("predict", vecs)

        x = _convert_to_vector(x)
        return self.call("predict", x)
Example #33
    def predict(
        self, x: Union["VectorLike", RDD["VectorLike"]]
    ) -> Union[RDD[Union[int, float]], Union[int, float]]:
        """
        Predict values for a single data point or an RDD of points
        using the model trained.

        .. versionadded:: 0.9.0
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        if self.numClasses == 2:
            margin = self.weights.dot(x) + self._intercept  # type: ignore[attr-defined]
            if margin > 0:
                prob = 1 / (1 + exp(-margin))
            else:
                exp_margin = exp(margin)
                prob = exp_margin / (1 + exp_margin)
            if self._threshold is None:
                return prob
            else:
                return 1 if prob > self._threshold else 0
        else:
            assert self._weightsMatrix is not None

            best_class = 0
            max_margin = 0.0
            if x.size + 1 == self._dataWithBiasSize:  # type: ignore[attr-defined]
                for i in range(0, self._numClasses - 1):
                    margin = (
                        x.dot(self._weightsMatrix[i][0 : x.size])  # type: ignore[attr-defined]
                        + self._weightsMatrix[i][x.size]  # type: ignore[attr-defined]
                    )
                    if margin > max_margin:
                        max_margin = margin
                        best_class = i + 1
            else:
                for i in range(0, self._numClasses - 1):
                    margin = x.dot(self._weightsMatrix[i])  # type: ignore[attr-defined]
                    if margin > max_margin:
                        max_margin = margin
                        best_class = i + 1
            return best_class
Example #34
    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict values for a single data point or an RDD of points using
        the model trained.

        .. versionadded:: 1.3.0

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        if isinstance(x, RDD):
            return self.call("predict", x.map(_convert_to_vector))

        else:
            return self.call("predict", _convert_to_vector(x))
Example #35
    def predict(
        self, x: Union["VectorLike", RDD["VectorLike"]]
    ) -> Union[RDD[Union[int, float]], Union[int, float]]:
        """
        Predict values for a single data point or an RDD of points
        using the model trained.

        .. versionadded:: 0.9.0
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        margin = self.weights.dot(x) + self.intercept  # type: ignore[attr-defined]
        if self._threshold is None:
            return margin
        else:
            return 1 if margin > self._threshold else 0
Example #36
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points using
        the model trained.
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        margin = self.weights.dot(x) + self._intercept
        if margin > 0:
            prob = 1 / (1 + exp(-margin))
        else:
            exp_margin = exp(margin)
            prob = exp_margin / (1 + exp_margin)
        if self._threshold is None:
            return prob
        else:
            return 1 if prob > self._threshold else 0
Example #37
    def computeCost(self, rdd):
        """
        Return the K-means cost (sum of squared distances of points to
        their nearest center) for this model on the given
        data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        rdd : ::py:class:`pyspark.RDD`
            The RDD of points to compute the cost on.
        """
        cost = callMLlibFunc(
            "computeCostKmeansModel",
            rdd.map(_convert_to_vector),
            [_convert_to_vector(c) for c in self.centers],
        )
        return cost
Example #38
    def transform(self, vector):
        """
        Applies transformation on a vector or an RDD[Vector].

        Parameters
        ----------
        vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            Input vector(s) to be transformed.

        Notes
        -----
        In Python, transform cannot currently be used within
        an RDD transformation or action.
        Call transform directly on the RDD instead.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return self.call("transform", vector)
Example #39
    def predict(self, x):
        """
        Find the cluster that each of the points belongs to in this
        model.

        :param x: the point (or RDD of points) for which to
            compute the cluster(s).
        """
        best = 0
        best_distance = float("inf")
        if isinstance(x, RDD):
            return x.map(self.predict)

        x = _convert_to_vector(x)
        for i in range(len(self.centers)):
            distance = x.squared_distance(self.centers[i])
            if distance < best_distance:
                best = i
                best_distance = distance
        return best
Example #40
    def computeCost(self, x):
        """
        Return the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) for this model on the given
        data. If provided with an RDD of points returns the sum.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to compute the cost(s).
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("computeCost", vecs)

        return self.call("computeCost", _convert_to_vector(x))
Example #41
    def findSynonyms(self, x, num):
        """
        :param x: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Find synonyms of a word

        Note: local use only
        """
        # TODO: make findSynonyms usable in RDD operations from python side
        ser = PickleSerializer()
        if isinstance(x, str):
            jlist = self._java_model.findSynonyms(x, num)
        else:
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            jlist = self._java_model.findSynonyms(vec, num)
        words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
        return zip(words, similarity)
Example #42
    def transform(self, x):
        """
        Transforms term frequency (TF) vectors to TF-IDF vectors.

        If `minDocFreq` was set for the IDF calculation,
        the terms which occur in fewer than `minDocFreq`
        documents will have an entry of 0.

        Note: In Python, transform cannot currently be used within
              an RDD transformation or action.
              Call transform directly on the RDD instead.

        :param x: an RDD of term frequency vectors or a term frequency
                 vector
        :return: an RDD of TF-IDF vectors or a TF-IDF vector
        """
        if isinstance(x, RDD):
            return JavaVectorTransformer.transform(self, x)

        x = _convert_to_vector(x)
        return JavaVectorTransformer.transform(self, x)
Example #43
    def transform(self, vector):
        """
        Applies unit length normalization on a vector.

        .. versionadded:: 1.2.0

        Parameters
        ----------
        vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            vector or RDD of vector to be normalized.

        Returns
        -------
        :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            normalized vector(s). If the norm of the input is zero, it
            will return the input vector.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return callMLlibFunc("normalizeVector", self.p, vector)
Example #44
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points
        using the model trained.
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        if self.numClasses == 2:
            margin = self.weights.dot(x) + self._intercept
            if margin > 0:
                prob = 1 / (1 + exp(-margin))
            else:
                exp_margin = exp(margin)
                prob = exp_margin / (1 + exp_margin)
            if self._threshold is None:
                return prob
            else:
                return 1 if prob > self._threshold else 0
        else:
            best_class = 0
            max_margin = 0.0
            if x.size + 1 == self._dataWithBiasSize:
                for i in range(0, self._numClasses - 1):
                    margin = (
                        x.dot(self._weightsMatrix[i][0 : x.size]) + self._weightsMatrix[i][x.size]
                    )
                    if margin > max_margin:
                        max_margin = margin
                        best_class = i + 1
            else:
                for i in range(0, self._numClasses - 1):
                    margin = x.dot(self._weightsMatrix[i])
                    if margin > max_margin:
                        max_margin = margin
                        best_class = i + 1
            return best_class
Example #45
    def update(self, data, decayFactor, timeUnit):
        """Update the centroids, according to data

        :param data: an RDD that represents the new data.
        :param decayFactor: forgetfulness of the previous centroids.
        :param timeUnit: Can be "batches" or "points". If "points", the
                         decay factor is raised to the power of the number
                         of new points; if "batches", it is used as is.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be an RDD, got %s." % type(data))
        data = data.map(_convert_to_vector)
        decayFactor = float(decayFactor)
        if timeUnit not in ["batches", "points"]:
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        vectorCenters = [_convert_to_vector(center) for center in self.centers]
        updatedModel = callMLlibFunc("updateStreamingKMeansModel",
                                     vectorCenters, self._clusterWeights, data,
                                     decayFactor, timeUnit)
        self.centers = array(updatedModel[0])
        self._clusterWeights = list(updatedModel[1])
        return self
Example #46
    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict the label of one or more examples.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            Data point (feature vector), or an RDD of data points (feature
            vectors).

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        if isinstance(x, RDD):
            return self.call("predict", x.map(_convert_to_vector))

        else:
            return self.call("predict", _convert_to_vector(x))
Example #47
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
Example #48
    def update(self, data: RDD["VectorLike"], decayFactor: float,
               timeUnit: str) -> "StreamingKMeansModel":
        """Update the centroids, according to data

        .. versionadded:: 1.5.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            RDD with new data for the model update.
        decayFactor : float
            Forgetfulness of the previous centroids.
        timeUnit :  str
            Can be "batches" or "points". If "points", the decay factor is
            raised to the power of the number of new points; if "batches",
            the decay factor is used as is.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be an RDD, got %s." % type(data))
        data = data.map(_convert_to_vector)
        decayFactor = float(decayFactor)
        if timeUnit not in ["batches", "points"]:
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        vectorCenters = [_convert_to_vector(center) for center in self.centers]
        updatedModel = callMLlibFunc(
            "updateStreamingKMeansModel",
            vectorCenters,
            self._clusterWeights,
            data,
            decayFactor,
            timeUnit,
        )
        self.centers = array(updatedModel[0])  # type: ignore[assignment]
        self._clusterWeights = list(updatedModel[1])
        return self
Example #49
 def __init__(self, scalingVector):
     self.scalingVector = _convert_to_vector(scalingVector)
Example #50
 def save(self, sc, path):
     java_centers = _py2java(sc,
                             [_convert_to_vector(c) for c in self.centers])
     java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(
         java_centers)
     java_model.save(sc._jsc.sc(), path)
Example #51
    def train(cls,
              rdd,
              k,
              maxIterations=100,
              initializationMode="k-means||",
              seed=None,
              initializationSteps=2,
              epsilon=1e-4,
              initialModel=None):
        """
        Train a k-means clustering model.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        rdd : ::py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of clusters to create.
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        initializationMode : str, optional
            The initialization algorithm. This can be either "random" or
            "k-means||".
            (default: "k-means||")
        seed : int, optional
            Random seed value for cluster initialization. Set as None to
            generate seed based on system time.
            (default: None)
        initializationSteps :
            Number of steps for the k-means|| initialization mode.
            This is an advanced setting -- the default of 2 is almost
            always enough.
            (default: 2)
        epsilon : float, optional
            Distance threshold within which a center will be considered to
            have converged. If all centers move less than this Euclidean
            distance, iterations are stopped.
            (default: 1e-4)
        initialModel : :py:class:`KMeansModel`, optional
            Initial cluster centers can be provided as a KMeansModel object
            rather than using the random or k-means|| initialization mode.
            (default: None)
        """
        clusterInitialModel = []
        if initialModel is not None:
            if not isinstance(initialModel, KMeansModel):
                raise TypeError("initialModel is of " +
                                str(type(initialModel)) + ". It needs "
                                "to be of <type 'KMeansModel'>")
            clusterInitialModel = [
                _convert_to_vector(c) for c in initialModel.clusterCenters
            ]
        model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector),
                              k, maxIterations, initializationMode, seed,
                              initializationSteps, epsilon,
                              clusterInitialModel)
        centers = callJavaFunc(rdd.context, model.clusterCenters)
        return KMeansModel([c.toArray() for c in centers])
Example #52
 def __init__(self, weights, intercept):
     self._coeff = _convert_to_vector(weights)
     self._intercept = float(intercept)
Example #53
    def chiSqTest(observed, expected=None):
        """
        .. note:: Experimental

        If `observed` is a Vector, conduct Pearson's chi-squared goodness
        of fit test of the observed data against the expected distribution,
        or against the uniform distribution (by default), with each category
        having an expected frequency of `1 / len(observed)`.
        (Note: `observed` cannot contain negative values.)

        If `observed` is a matrix, conduct Pearson's independence test on the
        input contingency matrix, which cannot contain negative entries or
        columns or rows that sum up to 0.

        If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
        test for every feature against the label across the input RDD.
        For each feature, the (feature, label) pairs are converted into a
        contingency matrix for which the chi-squared statistic is computed.
        All label and feature values must be categorical.

        :param observed: it could be a vector containing the observed categorical
                         counts/relative frequencies, or the contingency matrix
                         (containing either counts or relative frequencies),
                         or an RDD of LabeledPoint containing the labeled dataset
                         with categorical features. Real-valued features will be
                         treated as categorical for each distinct value.
        :param expected: Vector containing the expected categorical counts/relative
                         frequencies. `expected` is rescaled if the `expected` sum
                         differs from the `observed` sum.
        :return: ChiSqTestResult object containing the test statistic, degrees
                 of freedom, p-value, the method used, and the null hypothesis.

        >>> from pyspark.mllib.linalg import Vectors, Matrices
        >>> observed = Vectors.dense([4, 6, 5])
        >>> pearson = Statistics.chiSqTest(observed)
        >>> print(pearson.statistic)
        0.4
        >>> pearson.degreesOfFreedom
        2
        >>> print(round(pearson.pValue, 4))
        0.8187
        >>> pearson.method
        'pearson'
        >>> pearson.nullHypothesis
        'observed follows the same distribution as expected.'

        >>> observed = Vectors.dense([21, 38, 43, 80])
        >>> expected = Vectors.dense([3, 5, 7, 20])
        >>> pearson = Statistics.chiSqTest(observed, expected)
        >>> print(round(pearson.pValue, 4))
        0.0027

        >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
        >>> print(round(chi.statistic, 4))
        21.9958

        >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        ...         LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        ...         LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        ...         LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        ...         LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        ...         LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
        >>> rdd = sc.parallelize(data, 4)
        >>> chi = Statistics.chiSqTest(rdd)
        >>> print(chi[0].statistic)
        0.75
        >>> print(chi[1].statistic)
        1.5
        """
        if isinstance(observed, RDD):
            if not isinstance(observed.first(), LabeledPoint):
                raise ValueError("observed should be an RDD of LabeledPoint")
            jmodels = callMLlibFunc("chiSqTest", observed)
            return [ChiSqTestResult(m) for m in jmodels]

        if isinstance(observed, Matrix):
            jmodel = callMLlibFunc("chiSqTest", observed)
        else:
            if expected and len(expected) != len(observed):
                raise ValueError("`expected` should have the same length as `observed`")
            jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
        return ChiSqTestResult(jmodel)
Example #54
 def __init__(self, label, features):
     self.label = float(label)
     self.features = _convert_to_vector(features)
Example #55
    def train(cls,
              rdd,
              k,
              maxIterations=100,
              runs=1,
              initializationMode="k-means||",
              seed=None,
              initializationSteps=2,
              epsilon=1e-4,
              initialModel=None):
        """
        Train a k-means clustering model.

        :param rdd:
          Training points as an `RDD` of `Vector` or convertible
          sequence types.
        :param k:
          Number of clusters to create.
        :param maxIterations:
          Maximum number of iterations allowed.
          (default: 100)
        :param runs:
          This param has no effect since Spark 2.0.0.
        :param initializationMode:
          The initialization algorithm. This can be either "random" or
          "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed value for cluster initialization. Set as None to
          generate seed based on system time.
          (default: None)
        :param initializationSteps:
          Number of steps for the k-means|| initialization mode.
          This is an advanced setting -- the default of 2 is almost
          always enough.
          (default: 2)
        :param epsilon:
          Distance threshold within which a center will be considered to
          have converged. If all centers move less than this Euclidean
          distance, iterations are stopped.
          (default: 1e-4)
        :param initialModel:
          Initial cluster centers can be provided as a KMeansModel object
          rather than using the random or k-means|| initialization mode.
          (default: None)
        """
        if runs != 1:
            warnings.warn("The param `runs` has no effect since Spark 2.0.0.")
        clusterInitialModel = []
        if initialModel is not None:
            if not isinstance(initialModel, KMeansModel):
                raise Exception("initialModel is of " +
                                str(type(initialModel)) + ". It needs "
                                "to be of <type 'KMeansModel'>")
            clusterInitialModel = [
                _convert_to_vector(c) for c in initialModel.clusterCenters
            ]
        model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector),
                              k, maxIterations, runs, initializationMode, seed,
                              initializationSteps, epsilon,
                              clusterInitialModel)
        centers = callJavaFunc(rdd.context, model.clusterCenters)
        return KMeansModel([c.toArray() for c in centers])
Example #56
 def transform(self, vector):
     if isinstance(vector, RDD):
         vector = vector.map(_convert_to_vector)
     else:
         vector = _convert_to_vector(vector)
     return self.call("transform", vector)
Example #57
 def __init__(self, index, vector):
     self.index = int(index)
     self.vector = _convert_to_vector(vector)
Example #58
 def serialize(l):
     return ser.loads(ser.dumps(_convert_to_vector(l)))
Example #59
 def predict(self, x):
     """Return the most likely class for a data vector x"""
     x = _convert_to_vector(x)
     return self.labels[numpy.argmax(self.pi +
                                     x.dot(self.theta.transpose()))]
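
The expression is the log-posterior of (multinomial) naive Bayes: self.pi holds the class log-priors and self.theta the per-class feature log-probabilities, so pi + x.dot(theta.T) scores every class and argmax picks the most likely one. A standalone numpy sketch with hypothetical parameters:

    import numpy as np

    pi = np.log(np.array([0.6, 0.4]))        # hypothetical class log-priors
    theta = np.log(np.array([[0.7, 0.3],     # hypothetical per-class
                             [0.2, 0.8]]))   # feature log-probabilities
    x = np.array([3.0, 1.0])                 # feature counts
    log_posterior = pi + x.dot(theta.T)
    best_class = int(np.argmax(log_posterior))  # -> 0 for these numbers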
Example #60
 def predict(self, x):
     x = _convert_to_vector(x)
     margin = self.weights.dot(x) + self.intercept
     return 1 if margin >= 0 else 0
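
A worked example with made-up parameters: for weights [0.4, -0.3], intercept -0.1, and x = [1.0, 1.0], the margin is 0.4 - 0.3 - 0.1 = 0.0, so the predicted label is 1 (the margin is not negative):

    from pyspark.mllib.linalg import DenseVector

    weights = DenseVector([0.4, -0.3])  # hypothetical parameters
    intercept = -0.1
    margin = weights.dot([1.0, 1.0]) + intercept  # -> 0.0
    1 if margin >= 0 else 0  # -> 1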