Python _convert_to_vector Examples, pyspark.mllib.linalg._convert_to_vector Python Examples

Example #1

0

Show file

File: util.py Project: AsafZ/spark

 def appendBias(data):
     """
     Return a new vector equal to the input with a trailing `1.0`
     (bias) entry appended.

     A SparseVector input produces a SparseVector; any other input is
     rebuilt from its dense array form.
     """
     vector = _convert_to_vector(data)
     if not isinstance(vector, SparseVector):
         return _convert_to_vector(np.append(vector.toArray(), 1.0))
     size = len(vector)
     # The bias occupies the slot just past the current last position.
     indices_with_bias = np.append(vector.indices, size)
     values_with_bias = np.append(vector.values, 1.0)
     return SparseVector(size + 1, indices_with_bias, values_with_bias)

Example #2

0

Show file

File: regression.py Project: BeforeRain/spark

def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    """
    Train a model through ``train_func`` and wrap the result in ``modelClass``.

    :param train_func: callable invoked as ``train_func(data, weights_vector)``.
    :param modelClass: class instantiated with the values ``train_func``
                       returns.
    :param data: RDD whose first element must be a LabeledPoint.
    :param initial_weights: starting weights, or None to start from zeros
                            (one per feature of the first element).
    :return: an instance of ``modelClass``.
    :raises TypeError: if the first element of ``data`` is not a LabeledPoint.
    """
    from pyspark.mllib.classification import LogisticRegressionModel
    first = data.first()
    if not isinstance(first, LabeledPoint):
        raise TypeError("data should be an RDD of LabeledPoint, but got %s" % type(first))
    if initial_weights is None:
        # Reuse the element fetched above instead of calling data.first() again.
        initial_weights = [0.0] * len(first.features)
    initial_vector = _convert_to_vector(initial_weights)
    if modelClass == LogisticRegressionModel:
        # For this model class train_func returns four values instead of two.
        weights, intercept, numFeatures, numClasses = train_func(data, initial_vector)
        return modelClass(weights, intercept, numFeatures, numClasses)
    weights, intercept = train_func(data, initial_vector)
    return modelClass(weights, intercept)

Example #3

0

Show file

File: clustering.py Project: Raynes/spark

    def predictSoft(self, x):
        """
        Compute the membership of a point, or of every point in an RDD, to
        all mixture components.

        :param x:    a vector, or an RDD of vectors, holding the data points.
        :return:     the membership values over all mixture components for
                     vector 'x', or for each vector when 'x' is an RDD.
        """
        if not isinstance(x, RDD):
            return self.call("predictSoft", _convert_to_vector(x)).toArray()

        # Split the per-component (mu, sigma) pairs into two parallel tuples.
        components = [(g.mu, g.sigma) for g in self.gaussians]
        means, sigmas = zip(*components)
        points = x.map(_convert_to_vector)
        mixture_weights = _convert_to_vector(self.weights)
        memberships = callMLlibFunc("predictSoftGMM", points, mixture_weights, means, sigmas)
        return memberships.map(lambda row: pyarray.array("d", row))

Example #4

0

Show file

File: clustering.py Project: 11wzy001/spark

 def save(self, sc, path):
     """
     Persist this model at the given path.

     The centers are converted to vectors, wrapped in a JVM-side
     KMeansModel, and saved through that object.
     """
     center_vectors = [_convert_to_vector(center) for center in self.centers]
     jvm_centers = _py2java(sc, center_vectors)
     jvm_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(jvm_centers)
     jvm_model.save(sc._jsc.sc(), path)

Example #5

0

Show file

File: clustering.py Project: BeforeRain/spark

 def train(
     cls,
     rdd,
     k,
     maxIterations=100,
     runs=1,
     initializationMode="k-means||",
     seed=None,
     initializationSteps=5,
     epsilon=1e-4,
     initialModel=None,
 ):
     """
     Train a k-means clustering model.

     When `initialModel` is supplied it must be a KMeansModel; its cluster
     centers are forwarded as the initial centers. Otherwise an empty list
     is forwarded.
     """
     if initialModel is None:
         initial_centers = []
     elif isinstance(initialModel, KMeansModel):
         initial_centers = [_convert_to_vector(center) for center in initialModel.clusterCenters]
     else:
         raise Exception(
             "initialModel is of " + str(type(initialModel)) + ". It needs " "to be of <type 'KMeansModel'>"
         )
     points = rdd.map(_convert_to_vector)
     trained = callMLlibFunc(
         "trainKMeansModel", points, k, maxIterations, runs, initializationMode,
         seed, initializationSteps, epsilon, initial_centers,
     )
     jvm_centers = callJavaFunc(rdd.context, trained.clusterCenters)
     return KMeansModel([center.toArray() for center in jvm_centers])

Example #6

0

Show file

File: clustering.py Project: 11wzy001/spark

    def update(self, data, decayFactor, timeUnit):
        """Update the centroids according to new data.

        :param data:
          RDD holding the new data for the model update.
        :param decayFactor:
          Forgetfulness of the previous centroids.
        :param timeUnit:
          Either "batches" or "points". With "points" the decay factor is
          raised to the power of the number of new points; with "batches"
          the decay factor is used as is.
        :return:
          This model, with its centers and cluster weights replaced.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be of an RDD, got %s." % type(data))
        points = data.map(_convert_to_vector)
        decay = float(decayFactor)
        if timeUnit not in ("batches", "points"):
            raise ValueError("timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        center_vectors = [_convert_to_vector(c) for c in self.centers]
        result = callMLlibFunc(
            "updateStreamingKMeansModel", center_vectors, self._clusterWeights,
            points, decay, timeUnit)
        # result[0] holds the new centers, result[1] the new cluster weights.
        self.centers = array(result[0])
        self._clusterWeights = list(result[1])
        return self

Example #7

0

Show file

File: tree.py Project: 312268112/spark

    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        :return:   for an RDD, an RDD built from the Java model's
                   predictions (an empty RDD when the input has no
                   elements); otherwise whatever the Java model's predict
                   returns for the single point.
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                # Only the first element is inspected to decide on conversion.
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        # Assume x is a single data point: pickle it and load it on the JVM side.
        pickled = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = SerDe.loads(pickled)
        return self._java_model.predict(vec)

Example #8

0

Show file

File: regression.py Project: 31z4/spark

 def predict(self, x):
     """
     Compute the predicted value of the dependent variable for a vector
     x of independent-variable values.

     The result is the dot product of the weights with x, plus the
     intercept.
     """
     features = _convert_to_vector(x)
     prediction = self.weights.dot(features) + self.intercept
     return prediction

Example #9

0

Show file

File: regression.py Project: dnprock/spark

def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    """
    Train a model through ``train_func`` and wrap the result in ``modelClass``.

    :param train_func: callable invoked as ``train_func(data, weights_vector)``
                       and returning ``(weights, intercept)``.
    :param modelClass: class instantiated as ``modelClass(weights, intercept)``.
    :param data: RDD whose first element must be a LabeledPoint.
    :param initial_weights: starting weights; None or an empty sequence
                            means zeros (one per feature of the first
                            element).
    :return: an instance of ``modelClass``.
    :raises ValueError: if the first element of ``data`` is not a LabeledPoint.
    """
    first = data.first()
    if not isinstance(first, LabeledPoint):
        # Wrap in a 1-tuple so a tuple-valued element is formatted as a whole
        # instead of being unpacked into several % arguments.
        raise ValueError("data should be an RDD of LabeledPoint, but got %s" % (first,))
    # `initial_weights or ...` would take the truth value of the weights,
    # which is ambiguous for a multi-element numpy array.
    if initial_weights is None or len(initial_weights) == 0:
        # Reuse the element fetched above instead of calling data.first() again.
        initial_weights = [0.0] * len(first.features)
    weights, intercept = train_func(data, _convert_to_vector(initial_weights))
    return modelClass(weights, intercept)

Example #10

0

Show file

File: clustering.py Project: GuoNing89/Study

 def computeCost(self, rdd):
     """
     Compute the K-means cost of this model on the given data, i.e. the
     sum of squared distances of points to their nearest center.
     """
     points = rdd.map(_convert_to_vector)
     center_vectors = [_convert_to_vector(center) for center in self.centers]
     return callMLlibFunc("computeCostKmeansModel", points, center_vectors)

Example #11

0

Show file

File: test_linalg.py Project: drewrobb/spark

    def test_serialize(self):
        """Check that scipy sparse columns convert to, and round-trip as, one SparseVector."""
        from scipy.sparse import lil_matrix

        column = lil_matrix((4, 1))
        column[1, 0] = 1
        column[3, 0] = 2
        expected = SparseVector(4, {1: 1, 3: 2})

        # Direct conversion, first from the LIL matrix and then from each
        # of its other sparse formats.
        self.assertEqual(expected, _convert_to_vector(column))
        for to_format in ("tocsc", "tocoo", "tocsr", "todok"):
            self.assertEqual(expected, _convert_to_vector(getattr(column, to_format)()))

        def round_trip(matrix):
            # `ser` is not defined in this method; it is resolved from an
            # outer scope.
            return ser.loads(ser.dumps(_convert_to_vector(matrix)))

        # Conversion followed by a dumps/loads round trip.
        self.assertEqual(expected, round_trip(column))
        for to_format in ("tocsc", "tocsr", "todok"):
            self.assertEqual(expected, round_trip(getattr(column, to_format)()))

Example #12

0

Show file

File: classification.py Project: aman010/spark

 def predict(self, x):
     """
     Return 1 when the logistic probability for x exceeds 0.5, else 0.

     The margin is the dot product of the weights with x, plus the
     intercept.
     """
     features = _convert_to_vector(x)
     margin = self.weights.dot(features) + self._intercept
     # Both branches only ever pass a non-positive argument to exp().
     if margin > 0:
         prob = 1 / (1 + exp(-margin))
     else:
         exp_margin = exp(margin)
         prob = exp_margin / (1 + exp_margin)
     if prob > 0.5:
         return 1
     return 0

Example #13

0

Show file

File: regression.py Project: BeforeRain/spark

    def setInitialWeights(self, initialWeights):
        """
        Set the initial value of the weights.

        This has to be called before running trainOn and predictOn.
        Returns this object.
        """
        weight_vector = _convert_to_vector(initialWeights)
        # The model starts with an intercept of 0.
        self._model = LinearRegressionModel(weight_vector, 0)
        return self

Example #14

0

Show file

File: regression.py Project: BeforeRain/spark

 def predict(self, x):
     """
     Compute the predicted value of the dependent variable for a vector,
     or for every vector of an RDD, of independent-variable values.
     """
     if not isinstance(x, RDD):
         features = _convert_to_vector(x)
         return self.weights.dot(features) + self.intercept
     # For an RDD, apply this same method to each element.
     return x.map(self.predict)

Example #15

0

Show file

File: classification.py Project: vijaykiran/spark

 def predict(self, x):
     """
     Return the most likely class for a data vector, or an RDD of such
     classes when given an RDD of vectors.
     """
     if isinstance(x, RDD):
         return x.map(lambda v: self.predict(v))
     features = _convert_to_vector(x)
     # One score per label: pi plus the vector's product with theta transposed.
     scores = self.pi + features.dot(self.theta.transpose())
     return self.labels[numpy.argmax(scores)]

Example #16

0

Show file

File: mlbayes.py Project: AlexFridman/Multi-label-classification-with-spark

 def predict_all(self, x):
     """
     Return (integer label, score) pairs for a vector, ordered by
     descending score; for an RDD, map this method over every element.

     Scores are `pi + x.dot(theta.T)` after being passed through `scale`.
     """
     if isinstance(x, RDD):
         return x.map(lambda v: self.predict_all(v))
     features = _convert_to_vector(x)
     log_probs = self.pi + features.dot(self.theta.transpose())
     scores = scale(log_probs)
     int_labels = [int(label) for label in self.labels]
     pairs = zip(int_labels, scores)
     return sorted(pairs, key=lambda pair: pair[1], reverse=True)

Example #17

0

Show file

File: test_linalg.py Project: drewrobb/spark

 def test_convert_to_vector(self):
     """Check that a CSC matrix with unsorted indices converts to the expected SparseVector."""
     from scipy.sparse import csc_matrix

     # Build a CSC matrix whose indices are deliberately out of order.
     values = array([2.0, 1.0])
     row_indices = array([3, 1])
     column_pointers = array([0, 2])
     unsorted_csc = csc_matrix((values, row_indices, column_pointers))
     self.assertFalse(unsorted_csc.has_sorted_indices)

     expected = SparseVector(4, {1: 1, 3: 2})
     self.assertEqual(expected, _convert_to_vector(unsorted_csc))

Example #18

0

Show file

File: feature.py Project: ChenZhongPu/Simba

    def transform(self, vector):
        """
        Compute the Hadamard product of the vector with this
        transformer's scaling vector.

        Accepts either a single vector or an RDD of vectors.
        """
        is_rdd = isinstance(vector, RDD)
        converted = vector.map(_convert_to_vector) if is_rdd else _convert_to_vector(vector)
        return callMLlibFunc("elementwiseProductVector", self.scalingVector, converted)

Example #19

0

Show file

File: tree.py Project: Liuchang0812/spark

    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points using
        the model trained.
        """
        if isinstance(x, RDD):
            features = x.map(_convert_to_vector)
        else:
            features = _convert_to_vector(x)
        return self.call("predict", features)

Example #20

0

Show file

File: feature.py Project: alope107/spark

 def findSynonyms(self, word, num):
     """
     Find the "num" words closest in similarity to "word".

     "word" may be a string or a vector representation; anything that is
     not a string is converted to a vector first.
     Returns a dataframe with two fields, word and similarity (the
     cosine similarity).
     """
     query = word if isinstance(word, basestring) else _convert_to_vector(word)
     return self._call_java("findSynonyms", query, num)

Example #21

0

Show file

File: classification.py Project: ChaiBapchya/spark

    def setInitialWeights(self, initialWeights):
        """
        Set the initial value of the weights.

        This has to be called before running trainOn and predictOn.
        Returns this object.
        """
        weight_vector = _convert_to_vector(initialWeights)
        num_features = weight_vector.size

        # LogisticRegressionWithSGD does only binary classification, hence
        # the fixed class count of 2; the intercept starts at 0.
        self._model = LogisticRegressionModel(weight_vector, 0, num_features, 2)
        return self

Example #22

0

Show file

File: clustering.py Project: wso2/wso2-spark

 def predict(self, x):
     """Find the cluster to which x belongs in this model.

     Returns the index of the center with the smallest squared distance
     to x. On ties the lowest index wins, and 0 is returned when there
     are no centers.
     """
     point = _convert_to_vector(x)
     best = 0
     best_distance = float("inf")
     # enumerate replaces the Python 2-only xrange index loop.
     for i, center in enumerate(self.centers):
         distance = point.squared_distance(center)
         if distance < best_distance:
             best = i
             best_distance = distance
     return best

Example #23

0

Show file

File: clustering.py Project: 91Phoenix/spark

    def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
              seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
        """
        Train a k-means clustering model.

        :param rdd:
          Training points, given as an `RDD` of `Vector` or of convertible
          sequence types.
        :param k:
          Number of clusters to create.
        :param maxIterations:
          Maximum number of iterations allowed.
          (default: 100)
        :param runs:
          Number of runs to execute in parallel; the best model according
          to the cost function is returned (deprecated in 1.6.0). Any
          value other than 1 triggers a warning.
          (default: 1)
        :param initializationMode:
          The initialization algorithm, either "random" or "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed value for cluster initialization. Leave as None to
          generate a seed based on system time.
          (default: None)
        :param initializationSteps:
          Number of steps for the k-means|| initialization mode. This is
          an advanced setting -- the default of 5 is almost always enough.
          (default: 5)
        :param epsilon:
          Distance threshold within which a center is considered to have
          converged. Iterations stop once all centers move less than this
          Euclidean distance.
          (default: 1e-4)
        :param initialModel:
          Initial cluster centers, provided as a KMeansModel object, to
          use instead of the random or k-means|| initializationMode.
          (default: None)
        """
        if runs != 1:
            warnings.warn(
                "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.")
        if initialModel is None:
            initial_centers = []
        elif isinstance(initialModel, KMeansModel):
            initial_centers = [_convert_to_vector(center)
                               for center in initialModel.clusterCenters]
        else:
            raise Exception("initialModel is of "+str(type(initialModel))+". It needs "
                            "to be of <type 'KMeansModel'>")
        points = rdd.map(_convert_to_vector)
        trained = callMLlibFunc("trainKMeansModel", points, k, maxIterations,
                                runs, initializationMode, seed, initializationSteps, epsilon,
                                initial_centers)
        jvm_centers = callJavaFunc(rdd.context, trained.clusterCenters)
        return KMeansModel([center.toArray() for center in jvm_centers])

Example #24

0

Show file

File: regression.py Project: 312268112/spark

def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """
    Train a model through ``train_func`` and wrap the result in ``modelClass``.

    :param sc: not used by this function.
    :param train_func: callable invoked with the Java object RDD of the
                       cached data and the pickled initial weights; it
                       must return a two-element result.
    :param modelClass: class instantiated as ``modelClass(weights, intercept)``.
    :param data: RDD of training points; its first element must expose
                 ``features``.
    :param initial_weights: starting weights; None or an empty sequence
                            means zeros (one per feature of the first
                            element).
    :return: an instance of ``modelClass``.
    """
    # `initial_weights or ...` would take the truth value of the weights,
    # which is ambiguous for a multi-element numpy array.
    if initial_weights is None or len(initial_weights) == 0:
        initial_weights = [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    # ans[0] holds the pickled weights, ans[1] the intercept.
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])

Example #25

0

Show file

File: clustering.py Project: 1ambda/spark

    def predictSoft(self, x):
        """
        Find the membership of each point in 'x' to all mixture components.

        :param x:    RDD of data points.
        :return:     membership_matrix. RDD of array of double values.
        :raises TypeError: if 'x' is not an RDD.
        """
        if not isinstance(x, RDD):
            # Fail loudly instead of silently returning None.
            raise TypeError("x should be represented by an RDD, "
                            "but got %s." % type(x))
        means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
        membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
                                          _convert_to_vector(self.weights), means, sigmas)
        return membership_matrix.map(lambda row: pyarray.array('d', row))

Example #26

0

Show file

File: tree.py Project: Liuchang0812/spark

    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  A data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        features = x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
        return self.call("predict", features)

Example #27

0

Show file

File: feature.py Project: ChenZhongPu/Simba

    def transform(self, vector):
        """
        Apply unit length normalization to a vector.

        :param vector: the vector, or RDD of vectors, to normalize.
        :return: the normalized vector. If the norm of the input is zero,
                 the input vector is returned.
        """
        is_rdd = isinstance(vector, RDD)
        converted = vector.map(_convert_to_vector) if is_rdd else _convert_to_vector(vector)
        return callMLlibFunc("normalizeVector", self.p, converted)

Example #28

0

Show file

File: util.py Project: BViki/spark

 def _convert_labeled_point_to_libsvm(p):
     """Converts a LabeledPoint to a string in LIBSVM format.

     The label comes first, followed by space-separated ``index:value``
     entries whose indices are 1-based. A sparse feature vector
     contributes only its stored entries; any other vector contributes
     every position.
     """
     assert isinstance(p, LabeledPoint)
     items = [str(p.label)]
     v = _convert_to_vector(p.features)
     if isinstance(v, SparseVector):
         # zip replaces the Python 2-only xrange index loop.
         entries = zip(v.indices, v.values)
     else:
         entries = ((i, v[i]) for i in range(len(v)))
     # Shift the 0-based positions to 1-based indices.
     items.extend(str(i + 1) + ":" + str(value) for i, value in entries)
     return " ".join(items)

Example #29

0

Show file

File: clustering.py Project: tyronecai/spark

    def computeCost(self, x):
        """
        Compute the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) of this model on the given data.
        When given an RDD of points, the sum is returned.

        :param x: the point or RDD of points to compute the cost(s) for.
        """
        if not isinstance(x, RDD):
            return self.call("computeCost", _convert_to_vector(x))

        return self.call("computeCost", x.map(_convert_to_vector))

Example #30

0

Show file

File: feature.py Project: ChenZhongPu/Simba

    def findSynonyms(self, word, num):
        """
        Find synonyms of a word.

        :param word: a word, or a vector representation of a word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Note: local use only
        """
        if isinstance(word, basestring):
            query = word
        else:
            # Anything that is not a string is treated as a vector.
            query = _convert_to_vector(word)
        words, similarity = self.call("findSynonyms", query, num)
        return zip(words, similarity)

Example #31

0

Show file

File: clustering.py Project: bopopescu/spark-1.6.0-cdh5.12.0

    def predictSoft(self, x):
        """
        Compute the membership of each point in 'x' to all mixture
        components.

        :param x:    RDD of data points.
        :return:     membership_matrix. RDD of array of double values.
        :raises TypeError: if 'x' is not an RDD.
        """
        if not isinstance(x, RDD):
            raise TypeError("x should be represented by an RDD, "
                            "but got %s." % type(x))

        # Split the per-component (mu, sigma) pairs into two parallel tuples.
        components = [(g.mu, g.sigma) for g in self.gaussians]
        means, sigmas = zip(*components)
        points = x.map(_convert_to_vector)
        mixture_weights = _convert_to_vector(self.weights)
        memberships = callMLlibFunc("predictSoftGMM", points, mixture_weights,
                                    means, sigmas)
        return memberships.map(lambda row: pyarray.array('d', row))

Example #32

0

Show file

File: clustering.py Project: DavideB/xSpark

    def predict(self, x):
        """
        Find the cluster that each of the points belongs to in this
        model.

        :param x:
          A data point (or RDD of points) whose cluster index is wanted.
        :return:
          The predicted cluster index, or an RDD of predicted cluster
          indices when the input is an RDD.
        """
        if not isinstance(x, RDD):
            return self.call("predict", _convert_to_vector(x))

        return self.call("predict", x.map(_convert_to_vector))

Example #33

0

Show file

File: classification.py Project: zhengruifeng/spark

    def predict(
        self, x: Union["VectorLike", RDD["VectorLike"]]
    ) -> Union[RDD[Union[int, float]], Union[int, float]]:
        """
        Predict values for a single data point or an RDD of points
        using the model trained.

        With two classes the result is the logistic probability, or a 0/1
        label when a threshold is set. Otherwise the result is the index
        of the class with the largest margin above 0.0, falling back to
        class 0.

        .. versionadded:: 0.9.0
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        if self.numClasses != 2:
            assert self._weightsMatrix is not None

            # Class 0 is the baseline with margin 0.0; another class wins
            # only with a strictly larger margin.
            best_class, max_margin = 0, 0.0
            # When the weight rows carry one extra entry, that entry is
            # added to the margin instead of being multiplied with x.
            has_bias = x.size + 1 == self._dataWithBiasSize  # type: ignore[attr-defined]
            for i in range(self._numClasses - 1):
                row = self._weightsMatrix[i]
                if has_bias:
                    margin = x.dot(row[0 : x.size]) + row[x.size]  # type: ignore[attr-defined]
                else:
                    margin = x.dot(row)  # type: ignore[attr-defined]
                if margin > max_margin:
                    max_margin = margin
                    best_class = i + 1
            return best_class

        margin = self.weights.dot(x) + self._intercept  # type: ignore[attr-defined]
        # Both branches only ever pass a non-positive argument to exp().
        if margin > 0:
            prob = 1 / (1 + exp(-margin))
        else:
            exp_margin = exp(margin)
            prob = exp_margin / (1 + exp_margin)
        if self._threshold is None:
            return prob
        return 1 if prob > self._threshold else 0

Example #34

0

Show file

File: tree.py Project: zhengruifeng/spark

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict values for a single data point or an RDD of points using
        the model trained.

        .. versionadded:: 1.3.0

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        if not isinstance(x, RDD):
            return self.call("predict", _convert_to_vector(x))

        return self.call("predict", x.map(_convert_to_vector))

Example #35

0

Show file

File: classification.py Project: zhengruifeng/spark

    def predict(
        self, x: Union["VectorLike", RDD["VectorLike"]]
    ) -> Union[RDD[Union[int, float]], Union[int, float]]:
        """
        Predict values for a single data point or an RDD of points
        using the model trained.

        Without a threshold the raw margin is returned; with one, the
        result is 1 when the margin exceeds it and 0 otherwise.

        .. versionadded:: 0.9.0
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        features = _convert_to_vector(x)
        margin = self.weights.dot(features) + self.intercept  # type: ignore[attr-defined]
        if self._threshold is not None:
            return 1 if margin > self._threshold else 0
        return margin

Example #36

0

Show file

File: classification.py Project: a770606860/database

    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points using
        the model trained.

        Without a threshold the logistic probability is returned; with
        one, the result is 1 when the probability exceeds it and 0
        otherwise.
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        features = _convert_to_vector(x)
        margin = self.weights.dot(features) + self._intercept
        # Both branches only ever pass a non-positive argument to exp().
        if margin > 0:
            prob = 1 / (1 + exp(-margin))
        else:
            exp_margin = exp(margin)
            prob = exp_margin / (1 + exp_margin)
        if self._threshold is not None:
            return 1 if prob > self._threshold else 0
        return prob

Example #37

0

Show file

File: clustering.py Project: zoelin7/spark

    def computeCost(self, rdd):
        """
        Compute the K-means cost (sum of squared distances of points to
        their nearest center) of this model on the given data.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            The RDD of points to compute the cost on.
        """
        points = rdd.map(_convert_to_vector)
        center_vectors = [_convert_to_vector(center) for center in self.centers]
        return callMLlibFunc("computeCostKmeansModel", points, center_vectors)

Example #38

0

Show file

File: feature.py Project: yaooqinn/spark

    def transform(self, vector):
        """
        Apply the transformation to a vector or an RDD[Vector].

        Parameters
        ----------
        vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            Input vector(s) to be transformed.

        Notes
        -----
        In Python, transform cannot currently be used within
        an RDD transformation or action.
        Call transform directly on the RDD instead.
        """
        if not isinstance(vector, RDD):
            return self.call("transform", _convert_to_vector(vector))
        return self.call("transform", vector.map(_convert_to_vector))

Example #39

0

Show file

File: clustering.py Project: project-asap/spark-nested

    def predict(self, x):
        """
        Find the cluster that each of the points belongs to in this
        model.

        :param x: the point (or RDD of points) to compute the clusters
            for.
        :return: the index of the center with the smallest squared
            distance to the point (lowest index on ties, 0 when there are
            no centers), or an RDD of such indices for an RDD input.
        """
        if isinstance(x, RDD):
            return x.map(self.predict)

        # Search state is only needed for the single-point path.
        point = _convert_to_vector(x)
        best = 0
        best_distance = float("inf")
        # enumerate replaces the Python 2-only xrange index loop.
        for i, center in enumerate(self.centers):
            distance = point.squared_distance(center)
            if distance < best_distance:
                best = i
                best_distance = distance
        return best

Example #40

0

Show file

File: clustering.py Project: imback82/spark-4

    def computeCost(self, x):
        """
        Compute the Bisecting K-means cost (sum of squared distances of
        points to their nearest center) of this model on the given data.
        When given an RDD of points, the sum is returned.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            A data point (or RDD of points) to compute the cost(s).
            :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
            objects (list, tuple, numpy.ndarray).
        """
        if not isinstance(x, RDD):
            return self.call("computeCost", _convert_to_vector(x))

        return self.call("computeCost", x.map(_convert_to_vector))

Example #41

0

Show file

    def findSynonyms(self, x, num):
        """
        Find synonyms of a word

        :param x: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Note: local use only
        """
        # TODO: make findSynonyms usable in RDD operations from python side
        ser = PickleSerializer()
        # isinstance also accepts str subclasses, unlike an exact type comparison.
        if isinstance(x, str):
            jlist = self._java_model.findSynonyms(x, num)
        else:
            # Pickle the vector and load it on the JVM side before the lookup.
            pickled = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(pickled)
            jlist = self._java_model.findSynonyms(vec, num)
        words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
        return zip(words, similarity)

Example #42

0

Show file

File: feature.py Project: zhangchj1990/spark1.3-Source

    def transform(self, x):
        """
        Transform term frequency (TF) vectors into TF-IDF vectors.

        If `minDocFreq` was set for the IDF calculation,
        the terms which occur in fewer than `minDocFreq`
        documents will have an entry of 0.

        Note: In Python, transform cannot currently be used within
              an RDD transformation or action.
              Call transform directly on the RDD instead.

        :param x: an RDD of term frequency vectors, or a single term
                 frequency vector
        :return: an RDD of TF-IDF vectors, or a single TF-IDF vector
        """
        # An RDD is passed through untouched; a single input is converted first.
        if not isinstance(x, RDD):
            x = _convert_to_vector(x)
        return JavaVectorTransformer.transform(self, x)

Example #43

0

Show file

File: feature.py Project: yaooqinn/spark

    def transform(self, vector):
        """
        Apply unit length normalization to a vector.

        .. versionadded:: 1.2.0

        Parameters
        ----------
        vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            The vector, or RDD of vectors, to be normalized.

        Returns
        -------
        :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            The normalized vector(s). If the norm of the input is zero,
            the input vector is returned.
        """
        if not isinstance(vector, RDD):
            return callMLlibFunc("normalizeVector", self.p, _convert_to_vector(vector))
        return callMLlibFunc("normalizeVector", self.p, vector.map(_convert_to_vector))

Example #44

0

Show file

File: classification.py Project: yaooqinn/spark

    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points
        using the model trained.

        With two classes the result is the logistic probability, or a 0/1
        label when a threshold is set. Otherwise the result is the index
        of the class with the largest margin above 0.0, falling back to
        class 0.
        """
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict(v))

        x = _convert_to_vector(x)
        if self.numClasses == 2:
            margin = self.weights.dot(x) + self._intercept
            # Both branches only ever pass a non-positive argument to exp().
            if margin > 0:
                prob = 1 / (1 + exp(-margin))
            else:
                exp_margin = exp(margin)
                prob = exp_margin / (1 + exp_margin)
            if self._threshold is not None:
                return 1 if prob > self._threshold else 0
            return prob

        # When the weight rows carry one extra entry, that entry is added
        # to the margin instead of being multiplied with x.
        if x.size + 1 == self._dataWithBiasSize:
            def margin_of(row):
                return x.dot(row[0 : x.size]) + row[x.size]
        else:
            def margin_of(row):
                return x.dot(row)

        # Class 0 is the baseline with margin 0.0; class k (k >= 1) is
        # scored with weight row k - 1 and wins only with a strictly
        # larger margin.
        best_class = 0
        max_margin = 0.0
        for class_index in range(1, self._numClasses):
            margin = margin_of(self._weightsMatrix[class_index - 1])
            if margin > max_margin:
                max_margin = margin
                best_class = class_index
        return best_class

Example #45

0

Show file

    def update(self, data, decayFactor, timeUnit):
        """Refresh the centroids and cluster weights from new data.

        :param data: RDD holding the new data points.
        :param decayFactor: forgetfulness of the previous centroids.
        :param timeUnit: Either "batches" or "points". With "points" the
                         decay factor is raised to the power of the number
                         of new points; with "batches" it is used as it is.
        :return: this model, after its centers and cluster weights have
                 been replaced.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be of an RDD, got %s." % type(data))
        points = data.map(_convert_to_vector)
        decay = float(decayFactor)
        if timeUnit not in ("batches", "points"):
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        centers_as_vectors = list(map(_convert_to_vector, self.centers))
        result = callMLlibFunc(
            "updateStreamingKMeansModel",
            centers_as_vectors,
            self._clusterWeights,
            points,
            decay,
            timeUnit,
        )
        # The call yields the new centers first and the new weights second.
        self.centers = array(result[0])
        self._clusterWeights = list(result[1])
        return self

Example #46

0

Show file

File: tree.py Project: zhengruifeng/spark

    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
        """
        Predict the label for a single example or for every example of an RDD.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
            One data point (feature vector), or an RDD of data points
            (feature vectors).

        Notes
        -----
        In Python, predict cannot currently be used within an RDD
        transformation or action.
        Call predict directly on the RDD instead.
        """
        # An RDD is converted element by element; anything else is treated
        # as one data point.
        features = x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
        return self.call("predict", features)

Example #47

0

Show file

File: tree.py Project: zhouheming/spark

    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        :return:   For an RDD input, an RDD of predictions (an empty RDD
                   when the input RDD has no elements); otherwise the
                   prediction returned by the Java model for the single
                   data point.
        """
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            # Only the first element is inspected to decide whether the
            # RDD still needs converting to vectors.
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        # Assume x is a single data point: pickle it and load it on the
        # JVM side before handing it to the Java model.
        payload = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(payload)
        return self._java_model.predict(vec)

Example #48

0

Show file

File: clustering.py Project: wangyum/spark

    def update(self, data: RDD["VectorLike"], decayFactor: float,
               timeUnit: str) -> "StreamingKMeansModel":
        """Recompute the centroids and cluster weights from new data.

        .. versionadded:: 1.5.0

        Parameters
        ----------
        data : :py:class:`pyspark.RDD`
            RDD with new data for the model update.
        decayFactor : float
            Forgetfulness of the previous centroids.
        timeUnit :  str
            Either "batches" or "points". With "points" the decay factor
            is raised to the power of the number of new points; with
            "batches" the decay factor is used as is.

        Returns
        -------
        StreamingKMeansModel
            This model, after its centers and cluster weights have been
            replaced.
        """
        if not isinstance(data, RDD):
            raise TypeError("Data should be of an RDD, got %s." % type(data))
        vectors = data.map(_convert_to_vector)
        decay = float(decayFactor)
        allowed_units = ("batches", "points")
        if timeUnit not in allowed_units:
            raise ValueError(
                "timeUnit should be 'batches' or 'points', got %s." % timeUnit)
        current_centers = [_convert_to_vector(c) for c in self.centers]
        refreshed = callMLlibFunc(
            "updateStreamingKMeansModel", current_centers, self._clusterWeights,
            vectors, decay, timeUnit,
        )
        # The call yields the new centers first and the new weights second.
        self.centers = array(refreshed[0])  # type: ignore[assignment]
        self._clusterWeights = list(refreshed[1])
        return self

Example #49

0

Show file

File: feature.py Project: CGCL-codes/MURS

 def __init__(self, scalingVector):
     """Keep ``scalingVector`` after passing it through ``_convert_to_vector``."""
     converted = _convert_to_vector(scalingVector)
     self.scalingVector = converted

Example #50

0

Show file

 def save(self, sc, path):
     """
     Build a JVM ``KMeansModel`` from this model's centers and save it.

     :param sc: context providing the ``_jvm`` and ``_jsc`` handles used
                to reach the JVM.
     :param path: destination handed to the Java model's ``save``.
     """
     center_vectors = list(map(_convert_to_vector, self.centers))
     java_centers = _py2java(sc, center_vectors)
     model_class = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel
     java_model = model_class(java_centers)
     java_model.save(sc._jsc.sc(), path)

Example #51

0

Show file

File: clustering.py Project: imback82/spark-4

    def train(cls,
              rdd,
              k,
              maxIterations=100,
              initializationMode="k-means||",
              seed=None,
              initializationSteps=2,
              epsilon=1e-4,
              initialModel=None):
        """
        Fit a k-means clustering model to an RDD of points.

        .. versionadded:: 0.9.0

        Parameters
        ----------
        rdd : :py:class:`pyspark.RDD`
            Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
            or convertible sequence types.
        k : int
            Number of clusters to create.
        maxIterations : int, optional
            Maximum number of iterations allowed.
            (default: 100)
        initializationMode : str, optional
            The initialization algorithm. This can be either "random" or
            "k-means||".
            (default: "k-means||")
        seed : int, optional
            Random seed value for cluster initialization. Set as None to
            generate seed based on system time.
            (default: None)
        initializationSteps : int, optional
            Number of steps for the k-means|| initialization mode.
            This is an advanced setting -- the default of 2 is almost
            always enough.
            (default: 2)
        epsilon : float, optional
            Distance threshold within which a center will be considered to
            have converged. If all centers move less than this Euclidean
            distance, iterations are stopped.
            (default: 1e-4)
        initialModel : :py:class:`KMeansModel`, optional
            Initial cluster centers can be provided as a KMeansModel object
            rather than using the random or k-means|| initializationModel.
            (default: None)

        Returns
        -------
        :py:class:`KMeansModel`
            Model built from the trained cluster centers, each converted
            with ``toArray()``.

        Raises
        ------
        TypeError
            If ``initialModel`` is given but is not a ``KMeansModel``.
        """
        if initialModel is None:
            # No seed model: pass an empty list of initial centers.
            initial_centers = []
        elif isinstance(initialModel, KMeansModel):
            initial_centers = list(
                map(_convert_to_vector, initialModel.clusterCenters))
        else:
            raise TypeError("initialModel is of " +
                            str(type(initialModel)) + ". It needs "
                            "to be of <type 'KMeansModel'>")
        points = rdd.map(_convert_to_vector)
        java_model = callMLlibFunc(
            "trainKMeansModel", points, k, maxIterations, initializationMode,
            seed, initializationSteps, epsilon, initial_centers)
        trained_centers = callJavaFunc(rdd.context, java_model.clusterCenters)
        return KMeansModel([center.toArray() for center in trained_centers])

Example #52

0

Show file

File: regression.py Project: Ignalina/spark311

 def __init__(self, weights, intercept):
     """
     Store the model parameters.

     ``weights`` is converted with ``_convert_to_vector`` and kept as
     ``_coeff``; ``intercept`` is coerced to ``float``.
     """
     coefficients = _convert_to_vector(weights)
     bias = float(intercept)
     self._coeff = coefficients
     self._intercept = bias

Example #53

0

Show file

File: _statistics.py Project: bopopescu/SparkNew

    def chiSqTest(observed, expected=None):
        """
        .. note:: Experimental

        If `observed` is Vector, conduct Pearson's chi-squared goodness
        of fit test of the observed data against the expected distribution,
        or against the uniform distribution (by default), with each category
        having an expected frequency of `1 / len(observed)`.
        (Note: `observed` cannot contain negative values)

        If `observed` is matrix, conduct Pearson's independence test on the
        input contingency matrix, which cannot contain negative entries or
        columns or rows that sum up to 0.

        If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
        test for every feature against the label across the input RDD.
        For each feature, the (feature, label) pairs are converted into a
        contingency matrix for which the chi-squared statistic is computed.
        All label and feature values must be categorical.

        :param observed: it could be a vector containing the observed categorical
                         counts/relative frequencies, or the contingency matrix
                         (containing either counts or relative frequencies),
                         or an RDD of LabeledPoint containing the labeled dataset
                         with categorical features. Real-valued features will be
                         treated as categorical for each distinct value.
        :param expected: Vector containing the expected categorical counts/relative
                         frequencies. `expected` is rescaled if the `expected` sum
                         differs from the `observed` sum.
        :return: ChiSquaredTest object containing the test statistic, degrees
                 of freedom, p-value, the method used, and the null hypothesis.
                 For an RDD input, a list with one such object per result
                 returned by the backend.
        :raises ValueError: if `observed` is an RDD whose first element is not
                 a LabeledPoint, or if `expected` is given and its length
                 differs from that of `observed`.

        >>> from pyspark.mllib.linalg import Vectors, Matrices
        >>> observed = Vectors.dense([4, 6, 5])
        >>> pearson = Statistics.chiSqTest(observed)
        >>> print pearson.statistic
        0.4
        >>> pearson.degreesOfFreedom
        2
        >>> print round(pearson.pValue, 4)
        0.8187
        >>> pearson.method
        u'pearson'
        >>> pearson.nullHypothesis
        u'observed follows the same distribution as expected.'

        >>> observed = Vectors.dense([21, 38, 43, 80])
        >>> expected = Vectors.dense([3, 5, 7, 20])
        >>> pearson = Statistics.chiSqTest(observed, expected)
        >>> print round(pearson.pValue, 4)
        0.0027

        >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
        >>> print round(chi.statistic, 4)
        21.9958

        >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        ...         LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        ...         LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        ...         LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        ...         LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        ...         LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
        >>> rdd = sc.parallelize(data, 4)
        >>> chi = Statistics.chiSqTest(rdd)
        >>> print chi[0].statistic
        0.75
        >>> print chi[1].statistic
        1.5
        """
        if isinstance(observed, RDD):
            # Only the first element is inspected to validate the RDD type.
            if not isinstance(observed.first(), LabeledPoint):
                raise ValueError("observed should be an RDD of LabeledPoint")
            jmodels = callMLlibFunc("chiSqTest", observed)
            return [ChiSqTestResult(m) for m in jmodels]

        if isinstance(observed, Matrix):
            jmodel = callMLlibFunc("chiSqTest", observed)
        else:
            # Compare against None rather than relying on truthiness: a
            # multi-element NumPy array has no unambiguous truth value, and
            # an empty sequence would otherwise skip the length check.
            if expected is not None and len(expected) != len(observed):
                raise ValueError("`expected` should have same length with `observed`")
            jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
        return ChiSqTestResult(jmodel)

Example #54

0

Show file

File: regression.py Project: Ignalina/spark311

 def __init__(self, label, features):
     """
     Store one labeled example.

     ``label`` is coerced to ``float`` and ``features`` is converted with
     ``_convert_to_vector``.
     """
     numeric_label = float(label)
     feature_vector = _convert_to_vector(features)
     self.label = numeric_label
     self.features = feature_vector

Example #55

0

Show file

    def train(cls,
              rdd,
              k,
              maxIterations=100,
              runs=1,
              initializationMode="k-means||",
              seed=None,
              initializationSteps=2,
              epsilon=1e-4,
              initialModel=None):
        """
        Train a k-means clustering model.

        :param rdd:
          Training points as an `RDD` of `Vector` or convertible
          sequence types.
        :param k:
          Number of clusters to create.
        :param maxIterations:
          Maximum number of iterations allowed.
          (default: 100)
        :param runs:
          This param has no effect since Spark 2.0.0. A warning is
          issued when it is set to anything other than 1.
        :param initializationMode:
          The initialization algorithm. This can be either "random" or
          "k-means||".
          (default: "k-means||")
        :param seed:
          Random seed value for cluster initialization. Set as None to
          generate seed based on system time.
          (default: None)
        :param initializationSteps:
          Number of steps for the k-means|| initialization mode.
          This is an advanced setting -- the default of 2 is almost
          always enough.
          (default: 2)
        :param epsilon:
          Distance threshold within which a center will be considered to
          have converged. If all centers move less than this Euclidean
          distance, iterations are stopped.
          (default: 1e-4)
        :param initialModel:
          Initial cluster centers can be provided as a KMeansModel object
          rather than using the random or k-means|| initializationModel.
          (default: None)
        :return:
          A KMeansModel built from the trained cluster centers, each
          converted with `toArray()`.
        :raises TypeError:
          If `initialModel` is given but is not a KMeansModel.
        """
        if runs != 1:
            warnings.warn("The param `runs` has no effect since Spark 2.0.0.")
        clusterInitialModel = []
        if initialModel is not None:
            if not isinstance(initialModel, KMeansModel):
                # TypeError is the precise error for a wrong argument type;
                # it subclasses Exception, so handlers written for the
                # previous bare Exception still catch it.
                raise TypeError("initialModel is of " +
                                str(type(initialModel)) + ". It needs "
                                "to be of <type 'KMeansModel'>")
            clusterInitialModel = [
                _convert_to_vector(c) for c in initialModel.clusterCenters
            ]
        model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector),
                              k, maxIterations, runs, initializationMode, seed,
                              initializationSteps, epsilon,
                              clusterInitialModel)
        centers = callJavaFunc(rdd.context, model.clusterCenters)
        return KMeansModel([c.toArray() for c in centers])

Example #56

0

Show file

 def transform(self, vector):
     """
     Apply the wrapped "transform" call to a vector or an RDD of vectors.

     An RDD is converted element by element with ``_convert_to_vector``;
     any other input is converted as a single vector.
     """
     converted = (
         vector.map(_convert_to_vector)
         if isinstance(vector, RDD)
         else _convert_to_vector(vector)
     )
     return self.call("transform", converted)

Example #57

0

Show file

File: distributed.py Project: d3v3l0/snappy-spark

 def __init__(self, index, vector):
     """
     Store one indexed row.

     ``index`` is coerced with ``long`` and ``vector`` is converted with
     ``_convert_to_vector``.
     """
     row_index = long(index)
     row_vector = _convert_to_vector(vector)
     self.index = row_index
     self.vector = row_vector

Example #58

0

Show file

File: tests.py Project: bopopescu/SparkNew

 def serialize(l):
     """Convert ``l`` to a vector and round-trip it through ``ser``."""
     as_vector = _convert_to_vector(l)
     dumped = ser.dumps(as_vector)
     return ser.loads(dumped)

Example #59

0

Show file

 def predict(self, x):
     """Return the label whose score is highest for the data vector x."""
     features = _convert_to_vector(x)
     # One score per row of theta, offset by the matching entry of pi.
     scores = self.pi + features.dot(self.theta.transpose())
     best_index = numpy.argmax(scores)
     return self.labels[best_index]

Example #60

0

Show file

 def predict(self, x):
     """
     Classify the data point ``x`` as 1 or 0.

     Returns 1 when the weighted sum of the features plus the intercept
     is non-negative, otherwise 0.
     """
     features = _convert_to_vector(x)
     margin = self.weights.dot(features) + self.intercept
     if margin >= 0:
         return 1
     return 0