def appendBias(data):
    """
    Returns a new vector with `1.0` (bias) appended to
    the end of the input vector.
    """
    source = _convert_to_vector(data)
    if not isinstance(source, SparseVector):
        # Dense path: extend the dense array and convert it back to a vector.
        return _convert_to_vector(np.append(source.toArray(), 1.0))
    # Sparse path: the bias is stored at the new last index (the old length).
    size = len(source)
    indices = np.append(source.indices, size)
    values = np.append(source.values, 1.0)
    return SparseVector(size + 1, indices, values)
def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    """
    Run `train_func` on `data` and wrap its result in `modelClass`.

    :param train_func: callable invoked as
                       `train_func(data, initial_weights_vector)`.
    :param modelClass: class instantiated with the values returned by
                       `train_func`.
    :param data: RDD whose first element must be a LabeledPoint.
    :param initial_weights: initial weights, or None to start from one
                            `0.0` per feature of the first data point.
    :return: an instance of `modelClass`.
    :raises TypeError: if the first element of `data` is not a LabeledPoint.
    """
    from pyspark.mllib.classification import LogisticRegressionModel
    first = data.first()
    if not isinstance(first, LabeledPoint):
        raise TypeError("data should be an RDD of LabeledPoint, but got %s" % type(first))
    if initial_weights is None:
        # Reuse the element fetched above rather than calling `first()` again.
        initial_weights = [0.0] * len(first.features)
    if modelClass == LogisticRegressionModel:
        # For this model class the trainer result is unpacked into four values.
        weights, intercept, numFeatures, numClasses = train_func(
            data, _convert_to_vector(initial_weights))
        return modelClass(weights, intercept, numFeatures, numClasses)
    else:
        weights, intercept = train_func(data, _convert_to_vector(initial_weights))
        return modelClass(weights, intercept)
def predictSoft(self, x):
    """
    Find the membership of point 'x' or each point in RDD 'x' to all
    mixture components.

    :param x: vector or RDD of vector represents data points.
    :return: the membership value to all mixture components for vector 'x'
             or each vector in RDD 'x'.
    """
    if not isinstance(x, RDD):
        # Single point: delegate to the wrapped model and return an array.
        return self.call("predictSoft", _convert_to_vector(x)).toArray()

    # Split the per-component (mu, sigma) pairs into two parallel tuples.
    components = [(g.mu, g.sigma) for g in self.gaussians]
    means, sigmas = zip(*components)
    memberships = callMLlibFunc(
        "predictSoftGMM",
        x.map(_convert_to_vector),
        _convert_to_vector(self.weights),
        means,
        sigmas,
    )
    return memberships.map(lambda row: pyarray.array("d", row))
def save(self, sc, path):
    """
    Save this model to the given path.

    The centers are converted to vectors, handed to a JVM-side
    KMeansModel, and that object's `save` is called.
    """
    center_vectors = [_convert_to_vector(center) for center in self.centers]
    jvm_model_class = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel
    jvm_model = jvm_model_class(_py2java(sc, center_vectors))
    jvm_model.save(sc._jsc.sc(), path)
def train(
    cls,
    rdd,
    k,
    maxIterations=100,
    runs=1,
    initializationMode="k-means||",
    seed=None,
    initializationSteps=5,
    epsilon=1e-4,
    initialModel=None,
):
    """
    Train a k-means clustering model.

    Every element of `rdd` is converted with `_convert_to_vector`, and all
    parameters are forwarded to the "trainKMeansModel" MLlib call.

    :param initialModel: optional KMeansModel whose `clusterCenters` are
                         passed on as the initial centers. When None, an
                         empty list is passed.
    :return: a KMeansModel built from the trained centers, as arrays.
    :raises TypeError: if `initialModel` is given but is not a KMeansModel.
    """
    clusterInitialModel = []
    if initialModel is not None:
        if not isinstance(initialModel, KMeansModel):
            # TypeError is the precise error for a wrong-type argument and,
            # being an Exception subclass, is still caught by existing
            # `except Exception` handlers.
            raise TypeError(
                "initialModel is of " + str(type(initialModel)) + ". It needs "
                "to be of <type 'KMeansModel'>"
            )
        clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
    model = callMLlibFunc(
        "trainKMeansModel",
        rdd.map(_convert_to_vector),
        k,
        maxIterations,
        runs,
        initializationMode,
        seed,
        initializationSteps,
        epsilon,
        clusterInitialModel,
    )
    centers = callJavaFunc(rdd.context, model.clusterCenters)
    return KMeansModel([c.toArray() for c in centers])
def update(self, data, decayFactor, timeUnit):
    """Update the centroids, according to data

    :param data:
      RDD with new data for the model update.
    :param decayFactor:
      Forgetfulness of the previous centroids.
    :param timeUnit:
      Can be "batches" or "points". If points, then the decay factor
      is raised to the power of number of new points and if batches,
      then decay factor will be used as is.
    :return: this model, with `centers` and `_clusterWeights` replaced.
    """
    if not isinstance(data, RDD):
        raise TypeError("Data should be of an RDD, got %s." % type(data))
    vectors = data.map(_convert_to_vector)
    decay = float(decayFactor)
    if timeUnit not in ("batches", "points"):
        raise ValueError(
            "timeUnit should be 'batches' or 'points', got %s." % timeUnit)

    current_centers = [_convert_to_vector(center) for center in self.centers]
    result = callMLlibFunc(
        "updateStreamingKMeansModel", current_centers, self._clusterWeights,
        vectors, decay, timeUnit)
    # The call returns (centers, weights); store both back on the model.
    self.centers = array(result[0])
    self._clusterWeights = list(result[1])
    return self
def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x:  Data point (feature vector),
               or an RDD of data points (feature vectors).
    :return: for an RDD, an RDD of predictions (an empty RDD when `x` is
             empty); otherwise the prediction of the wrapped Java model
             for the single point.
    """
    ser = PickleSerializer()
    if isinstance(x, RDD):
        # Bulk prediction
        first = x.take(1)
        if not first:
            return self._sc.parallelize([])
        if not isinstance(first[0], Vector):
            # Only the first element is inspected to decide on conversion.
            x = x.map(_convert_to_vector)
        jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
        jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
        return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

    # Assume x is a single data point.
    payload = bytearray(ser.dumps(_convert_to_vector(x)))
    vec = self._sc._jvm.SerDe.loads(payload)
    return self._java_model.predict(vec)
def predict(self, x):
    """
    Predict the value of the dependent variable given a vector x
    containing values for the independent variables.

    The result is the dot product of the weights with `x`, plus the
    intercept.
    """
    features = _convert_to_vector(x)
    weighted_sum = self.weights.dot(features)
    return weighted_sum + self.intercept
def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
    """
    Run `train_func` on `data` and wrap its result in `modelClass`.

    :param train_func: callable invoked as
                       `train_func(data, initial_weights_vector)`; must
                       return a `(weights, intercept)` pair.
    :param modelClass: class instantiated as `modelClass(weights, intercept)`.
    :param data: RDD whose first element must be a LabeledPoint.
    :param initial_weights: initial weights; None or an empty sequence
                            means one `0.0` per feature of the first point.
    :return: an instance of `modelClass`.
    :raises ValueError: if the first element of `data` is not a LabeledPoint.
    """
    first = data.first()
    if not isinstance(first, LabeledPoint):
        # Wrap in a 1-tuple so a tuple-valued `first` is formatted as a single
        # value instead of being unpacked as multiple format arguments.
        raise ValueError("data should be an RDD of LabeledPoint, but got %s" % (first,))
    # Test for None / emptiness explicitly: `initial_weights or ...` raises
    # for multi-element numpy arrays, whose truth value is ambiguous.
    if initial_weights is None or len(initial_weights) == 0:
        # Reuse the element fetched above rather than calling `first()` again.
        initial_weights = [0.0] * len(first.features)
    weights, intercept = train_func(data, _convert_to_vector(initial_weights))
    return modelClass(weights, intercept)
def computeCost(self, rdd):
    """
    Return the K-means cost (sum of squared distances of points to
    their nearest center) for this model on the given data.
    """
    points = rdd.map(_convert_to_vector)
    center_vectors = [_convert_to_vector(center) for center in self.centers]
    return callMLlibFunc("computeCostKmeansModel", points, center_vectors)
def test_serialize(self):
    """
    Check that scipy sparse column matrices convert to the expected
    SparseVector, directly and after a serializer round trip.
    """
    from scipy.sparse import lil_matrix

    lil = lil_matrix((4, 1))
    lil[1, 0] = 1
    lil[3, 0] = 2
    expected = SparseVector(4, {1: 1, 3: 2})

    # Direct conversion, for each sparse format.
    for matrix in (lil, lil.tocsc(), lil.tocoo(), lil.tocsr(), lil.todok()):
        self.assertEqual(expected, _convert_to_vector(matrix))

    def serialize(l):
        return ser.loads(ser.dumps(_convert_to_vector(l)))

    # Round trip through the serializer (COO is not exercised here).
    for matrix in (lil, lil.tocsc(), lil.tocsr(), lil.todok()):
        self.assertEqual(expected, serialize(matrix))
def predict(self, x):
    """
    Return 1 if the logistic probability for `x` is above 0.5, else 0.

    The margin is the dot product of the weights with `x`, plus the
    intercept.
    """
    features = _convert_to_vector(x)
    score = self.weights.dot(features) + self._intercept
    # Both branches compute the same sigmoid; each one only ever calls
    # exp() on a non-positive argument.
    if score > 0:
        probability = 1 / (1 + exp(-score))
    else:
        grown = exp(score)
        probability = grown / (1 + grown)
    if probability > 0.5:
        return 1
    return 0
def setInitialWeights(self, initialWeights):
    """
    Set the initial value of weights.

    This must be set before running trainOn and predictOn
    """
    weight_vector = _convert_to_vector(initialWeights)
    # The model starts from the given weights with an intercept of 0.
    self._model = LinearRegressionModel(weight_vector, 0)
    return self
def predict(self, x):
    """
    Predict the value of the dependent variable given a vector or
    an RDD of vectors containing values for the independent variables.
    """
    if not isinstance(x, RDD):
        features = _convert_to_vector(x)
        return self.weights.dot(features) + self.intercept
    # RDD input: apply this same method to every element.
    return x.map(self.predict)
def predict(self, x):
    """
    Return the most likely class for a data vector
    or an RDD of vectors
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict(point))
    features = _convert_to_vector(x)
    # One score per label: `pi` plus the dot product with `theta` transposed.
    scores = self.pi + features.dot(self.theta.transpose())
    winner = numpy.argmax(scores)
    return self.labels[winner]
def predict_all(self, x):
    """
    Return (label, scaled score) pairs for `x`, highest score first.

    For an RDD the method is mapped over each element. Labels are cast
    to int and the scores are passed through `scale` before pairing.
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict_all(point))
    features = _convert_to_vector(x)
    raw_scores = self.pi + features.dot(self.theta.transpose())
    scaled_scores = scale(raw_scores)
    integer_labels = [int(label) for label in self.labels]
    ranked = zip(integer_labels, scaled_scores)
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)
def test_convert_to_vector(self):
    """
    Check that a CSC matrix with unsorted indices converts to the
    expected SparseVector.
    """
    from scipy.sparse import csc_matrix

    # Create a CSC matrix with non-sorted indices
    values = array([2.0, 1.0])
    row_indices = array([3, 1])
    column_pointers = array([0, 2])
    unsorted = csc_matrix((values, row_indices, column_pointers))
    self.assertFalse(unsorted.has_sorted_indices)

    expected = SparseVector(4, {1: 1, 3: 2})
    self.assertEqual(expected, _convert_to_vector(unsorted))
def transform(self, vector):
    """
    Computes the Hadamard product of the vector.

    Accepts a single vector or an RDD; RDD elements are converted one
    by one before the MLlib call.
    """
    if isinstance(vector, RDD):
        converted = vector.map(_convert_to_vector)
    else:
        converted = _convert_to_vector(vector)
    return callMLlibFunc("elementwiseProductVector", self.scalingVector, converted)
def predict(self, x):
    """
    Predict values for a single data point or an RDD of points using
    the model trained.
    """
    # RDD elements are converted lazily; a single point is converted directly.
    argument = x.map(_convert_to_vector) if isinstance(x, RDD) else _convert_to_vector(x)
    return self.call("predict", argument)
def findSynonyms(self, word, num):
    """
    Find "num" number of words closest in similarity to "word".
    word can be a string or vector representation.
    Returns a dataframe with two fields word and similarity (which
    gives the cosine similarity).
    """
    if isinstance(word, basestring):
        query = word
    else:
        # Anything that is not a string is treated as a vector.
        query = _convert_to_vector(word)
    return self._call_java("findSynonyms", query, num)
def setInitialWeights(self, initialWeights):
    """
    Set the initial value of weights.

    This must be set before running trainOn and predictOn.
    """
    weight_vector = _convert_to_vector(initialWeights)
    feature_count = weight_vector.size

    # LogisticRegressionWithSGD does only binary classification.
    self._model = LogisticRegressionModel(weight_vector, 0, feature_count, 2)
    return self
def predict(self, x):
    """Find the cluster to which x belongs in this model."""
    point = _convert_to_vector(x)
    nearest_index = 0
    nearest_distance = float("inf")
    # Strict '<' keeps the earliest center when distances tie.
    for index, center in enumerate(self.centers):
        candidate = point.squared_distance(center)
        if candidate < nearest_distance:
            nearest_index, nearest_distance = index, candidate
    return nearest_index
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
          seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
    """
    Train a k-means clustering model.

    :param rdd:
      Training points as an `RDD` of `Vector` or convertible
      sequence types.
    :param k:
      Number of clusters to create.
    :param maxIterations:
      Maximum number of iterations allowed.
      (default: 100)
    :param runs:
      Number of runs to execute in parallel. The best model according
      to the cost function will be returned (deprecated in 1.6.0).
      (default: 1)
    :param initializationMode:
      The initialization algorithm. This can be either "random" or
      "k-means||".
      (default: "k-means||")
    :param seed:
      Random seed value for cluster initialization. Set as None to
      generate seed based on system time.
      (default: None)
    :param initializationSteps:
      Number of steps for the k-means|| initialization mode.
      This is an advanced setting -- the default of 5 is almost
      always enough.
      (default: 5)
    :param epsilon:
      Distance threshold within which a center will be considered to
      have converged. If all centers move less than this Euclidean
      distance, iterations are stopped.
      (default: 1e-4)
    :param initialModel:
      Initial cluster centers can be provided as a KMeansModel object
      rather than using the random or k-means|| initializationModel.
      (default: None)
    :return: a KMeansModel built from the trained centers, as arrays.
    :raises TypeError: if `initialModel` is given but is not a KMeansModel.
    """
    if runs != 1:
        warnings.warn(
            "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.")
    clusterInitialModel = []
    if initialModel is not None:
        if not isinstance(initialModel, KMeansModel):
            # TypeError is the precise error for a wrong-type argument and,
            # being an Exception subclass, is still caught by existing
            # `except Exception` handlers.
            raise TypeError("initialModel is of "+str(type(initialModel))+". It needs "
                            "to be of <type 'KMeansModel'>")
        clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]
    model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations,
                          runs, initializationMode, seed, initializationSteps, epsilon,
                          clusterInitialModel)
    centers = callJavaFunc(rdd.context, model.clusterCenters)
    return KMeansModel([c.toArray() for c in centers])
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """
    Run `train_func` on a cached, re-serialized copy of `data` and wrap
    the result in `modelClass`.

    :param sc: not used by this function's body.
    :param train_func: callable invoked with the Java object RDD and the
                       pickled initial weights; must return a sequence of
                       length 2 (pickled weights, intercept).
    :param modelClass: class instantiated as `modelClass(weights, intercept)`.
    :param data: RDD whose first element exposes `features`.
    :param initial_weights: initial weights; None or an empty sequence
                            means one `0.0` per feature of the first point.
    :return: an instance of `modelClass`.
    """
    # Test for None / emptiness explicitly: `initial_weights or ...` raises
    # for multi-element numpy arrays, whose truth value is ambiguous.
    if initial_weights is None or len(initial_weights) == 0:
        initial_weights = [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
def predictSoft(self, x):
    """
    Find the membership of each point in 'x' to all mixture components.

    :param x: RDD of data points.
    :return: membership_matrix. RDD of array of double values.
    :raises TypeError: if `x` is not an RDD.
    """
    if not isinstance(x, RDD):
        # Fail loudly instead of silently returning None for non-RDD input.
        raise TypeError("x should be represented by an RDD, "
                        "but got %s." % type(x))
    # Split the per-component (mu, sigma) pairs into two parallel tuples.
    means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
    membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
                                      _convert_to_vector(self.weights), means, sigmas)
    return membership_matrix.map(lambda row: pyarray.array('d', row))
def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x:  Data point (feature vector),
               or an RDD of data points (feature vectors).
    """
    if not isinstance(x, RDD):
        return self.call("predict", _convert_to_vector(x))
    # RDD input: convert each element, then predict on the whole RDD.
    converted = x.map(_convert_to_vector)
    return self.call("predict", converted)
def transform(self, vector):
    """
    Applies unit length normalization on a vector.

    :param vector: vector or RDD of vector to be normalized.
    :return: normalized vector. If the norm of the input is zero, it
             will return the input vector.
    """
    if isinstance(vector, RDD):
        prepared = vector.map(_convert_to_vector)
    else:
        prepared = _convert_to_vector(vector)
    # `self.p` is passed through to the MLlib call unchanged.
    return callMLlibFunc("normalizeVector", self.p, prepared)
def _convert_labeled_point_to_libsvm(p):
    """Converts a LabeledPoint to a string in LIBSVM format."""
    assert isinstance(p, LabeledPoint)
    tokens = [str(p.label)]
    features = _convert_to_vector(p.features)
    if isinstance(features, SparseVector):
        # Sparse: emit only the stored entries.
        entries = [(features.indices[k], features.values[k])
                   for k in xrange(len(features.indices))]
    else:
        # Dense: emit every position.
        entries = [(k, features[k]) for k in xrange(len(features))]
    # Indices are written one-based.
    tokens.extend(str(index + 1) + ":" + str(value) for index, value in entries)
    return " ".join(tokens)
def computeCost(self, x):
    """
    Return the Bisecting K-means cost (sum of squared distances of
    points to their nearest center) for this model on the given
    data. If provided with an RDD of points returns the sum.

    :param x: the point or RDD of points to compute the cost(s).
    """
    if not isinstance(x, RDD):
        return self.call("computeCost", _convert_to_vector(x))
    return self.call("computeCost", x.map(_convert_to_vector))
def findSynonyms(self, word, num):
    """
    Find synonyms of a word

    :param word: a word or a vector representation of word
    :param num: number of synonyms to find
    :return: array of (word, cosineSimilarity)

    Note: local use only
    """
    query = word if isinstance(word, basestring) else _convert_to_vector(word)
    # The call yields two parallel sequences, which are paired up here.
    found_words, similarities = self.call("findSynonyms", query, num)
    return zip(found_words, similarities)
def predictSoft(self, x):
    """
    Find the membership of each point in 'x' to all mixture components.

    :param x: RDD of data points.
    :return: membership_matrix. RDD of array of double values.
    """
    if not isinstance(x, RDD):
        raise TypeError("x should be represented by an RDD, "
                        "but got %s." % type(x))
    # Split the per-component (mu, sigma) pairs into two parallel tuples.
    components = [(g.mu, g.sigma) for g in self.gaussians]
    means, sigmas = zip(*components)
    memberships = callMLlibFunc(
        "predictSoftGMM",
        x.map(_convert_to_vector),
        _convert_to_vector(self.weights),
        means,
        sigmas,
    )
    return memberships.map(lambda row: pyarray.array('d', row))
def predict(self, x):
    """
    Find the cluster that each of the points belongs to in this
    model.

    :param x: A data point (or RDD of points) to determine
              cluster index.
    :return: Predicted cluster index or an RDD of predicted
             cluster indices if the input is an RDD.
    """
    if isinstance(x, RDD):
        argument = x.map(_convert_to_vector)
    else:
        argument = _convert_to_vector(x)
    return self.call("predict", argument)
def predict(
    self, x: Union["VectorLike", RDD["VectorLike"]]
) -> Union[RDD[Union[int, float]], Union[int, float]]:
    """
    Predict values for a single data point or an RDD of points
    using the model trained.

    For two classes the result is the logistic probability, or a 0/1
    label when a threshold is set. Otherwise it is the index of the
    class with the largest positive margin, with class 0 as the default.

    .. versionadded:: 0.9.0
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict(point))

    x = _convert_to_vector(x)
    if self.numClasses == 2:
        score = self.weights.dot(x) + self._intercept  # type: ignore[attr-defined]
        # Both branches compute the same sigmoid; each one only ever
        # calls exp() on a non-positive argument.
        if score > 0:
            probability = 1 / (1 + exp(-score))
        else:
            grown = exp(score)
            probability = grown / (1 + grown)
        if self._threshold is None:
            return probability
        return 1 if probability > self._threshold else 0

    assert self._weightsMatrix is not None
    # When a row holds one more entry than x, its last entry is added as a bias.
    with_bias = x.size + 1 == self._dataWithBiasSize  # type: ignore[attr-defined]
    winner = 0
    top_score = 0.0
    for k in range(0, self._numClasses - 1):
        row = self._weightsMatrix[k]
        if with_bias:
            score = (
                x.dot(row[0 : x.size])  # type: ignore[attr-defined]
                + row[x.size]  # type: ignore[attr-defined]
            )
        else:
            score = x.dot(row)  # type: ignore[attr-defined]
        if score > top_score:
            top_score = score
            winner = k + 1
    return winner
def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
    """
    Predict values for a single data point or an RDD of points
    using the model trained.

    .. versionadded:: 1.3.0

    Notes
    -----
    In Python, predict cannot currently be used within an RDD
    transformation or action.
    Call predict directly on the RDD instead.
    """
    if not isinstance(x, RDD):
        return self.call("predict", _convert_to_vector(x))
    # RDD input: convert each element, then predict on the whole RDD.
    return self.call("predict", x.map(_convert_to_vector))
def predict(
    self, x: Union["VectorLike", RDD["VectorLike"]]
) -> Union[RDD[Union[int, float]], Union[int, float]]:
    """
    Predict values for a single data point or an RDD of points
    using the model trained.

    Returns the raw margin when no threshold is set, otherwise 1 if the
    margin exceeds the threshold and 0 if not.

    .. versionadded:: 0.9.0
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict(point))

    x = _convert_to_vector(x)
    margin = self.weights.dot(x) + self.intercept  # type: ignore[attr-defined]
    if self._threshold is not None:
        return 1 if margin > self._threshold else 0
    return margin
def predict(self, x):
    """
    Predict values for a single data point or an RDD of points
    using the model trained.

    Returns the logistic probability when no threshold is set,
    otherwise 1 if the probability exceeds the threshold and 0 if not.
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict(point))

    features = _convert_to_vector(x)
    score = self.weights.dot(features) + self._intercept
    # Both branches compute the same sigmoid; each one only ever calls
    # exp() on a non-positive argument.
    if score > 0:
        probability = 1 / (1 + exp(-score))
    else:
        grown = exp(score)
        probability = grown / (1 + grown)

    if self._threshold is None:
        return probability
    if probability > self._threshold:
        return 1
    return 0
def computeCost(self, rdd):
    """
    Return the K-means cost (sum of squared distances of points to
    their nearest center) for this model on the given
    data.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    rdd : ::py:class:`pyspark.RDD`
        The RDD of points to compute the cost on.
    """
    points = rdd.map(_convert_to_vector)
    center_vectors = [_convert_to_vector(center) for center in self.centers]
    return callMLlibFunc("computeCostKmeansModel", points, center_vectors)
def transform(self, vector):
    """
    Applies transformation on a vector or an RDD[Vector].

    Parameters
    ----------
    vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        Input vector(s) to be transformed.

    Notes
    -----
    In Python, transform cannot currently be used within
    an RDD transformation or action.
    Call transform directly on the RDD instead.
    """
    if isinstance(vector, RDD):
        prepared = vector.map(_convert_to_vector)
    else:
        prepared = _convert_to_vector(vector)
    return self.call("transform", prepared)
def predict(self, x):
    """
    Find the cluster that each of the points belongs to in this model.

    :param x: the point (or RDD of points) to determine
      compute the clusters for.
    """
    if isinstance(x, RDD):
        # RDD input: apply this same method to every element.
        return x.map(self.predict)

    point = _convert_to_vector(x)
    nearest_index = 0
    nearest_distance = float("inf")
    # Strict '<' keeps the earliest center when distances tie.
    for index, center in enumerate(self.centers):
        candidate = point.squared_distance(center)
        if candidate < nearest_distance:
            nearest_index, nearest_distance = index, candidate
    return nearest_index
def computeCost(self, x):
    """
    Return the Bisecting K-means cost (sum of squared distances of
    points to their nearest center) for this model on the given
    data. If provided with an RDD of points returns the sum.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        A data point (or RDD of points) to compute the cost(s).
        :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent
        objects (list, tuple, numpy.ndarray).
    """
    if not isinstance(x, RDD):
        return self.call("computeCost", _convert_to_vector(x))
    return self.call("computeCost", x.map(_convert_to_vector))
def findSynonyms(self, x, num):
    """
    Find synonyms of a word

    :param x: a word or a vector representation of word
    :param num: number of synonyms to find
    :return: array of (word, cosineSimilarity)

    Note: local use only
    """
    # TODO: make findSynonyms usable in RDD operations from python side
    ser = PickleSerializer()
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str as words.
    if isinstance(x, str):
        jlist = self._java_model.findSynonyms(x, num)
    else:
        # Non-string input is treated as a vector and pickled for the JVM.
        payload = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(payload)
        jlist = self._java_model.findSynonyms(vec, num)
    words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
    return zip(words, similarity)
def transform(self, x):
    """
    Transforms term frequency (TF) vectors to TF-IDF vectors.

    If `minDocFreq` was set for the IDF calculation,
    the terms which occur in fewer than `minDocFreq`
    documents will have an entry of 0.

    Note: In Python, transform cannot currently be used within
          an RDD transformation or action.
          Call transform directly on the RDD instead.

    :param x: an RDD of term frequency vectors or a term frequency vector
    :return: an RDD of TF-IDF vectors or a TF-IDF vector
    """
    if not isinstance(x, RDD):
        # Only a single vector is converted here; an RDD is passed as is.
        x = _convert_to_vector(x)
    return JavaVectorTransformer.transform(self, x)
def transform(self, vector):
    """
    Applies unit length normalization on a vector.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        vector or RDD of vector to be normalized.

    Returns
    -------
    :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        normalized vector(s). If the norm of the input is zero, it
        will return the input vector.
    """
    if isinstance(vector, RDD):
        prepared = vector.map(_convert_to_vector)
    else:
        prepared = _convert_to_vector(vector)
    # `self.p` is passed through to the MLlib call unchanged.
    return callMLlibFunc("normalizeVector", self.p, prepared)
def predict(self, x):
    """
    Predict values for a single data point or an RDD of points
    using the model trained.

    For two classes the result is the logistic probability, or a 0/1
    label when a threshold is set. Otherwise it is the index of the
    class with the largest positive margin, with class 0 as the default.
    """
    if isinstance(x, RDD):
        return x.map(lambda point: self.predict(point))

    x = _convert_to_vector(x)
    if self.numClasses == 2:
        score = self.weights.dot(x) + self._intercept
        # Both branches compute the same sigmoid; each one only ever
        # calls exp() on a non-positive argument.
        if score > 0:
            probability = 1 / (1 + exp(-score))
        else:
            grown = exp(score)
            probability = grown / (1 + grown)
        if self._threshold is None:
            return probability
        return 1 if probability > self._threshold else 0

    # When a row holds one more entry than x, its last entry is added as a bias.
    with_bias = x.size + 1 == self._dataWithBiasSize
    winner = 0
    top_score = 0.0
    for k in range(0, self._numClasses - 1):
        row = self._weightsMatrix[k]
        if with_bias:
            score = x.dot(row[0 : x.size]) + row[x.size]
        else:
            score = x.dot(row)
        if score > top_score:
            top_score = score
            winner = k + 1
    return winner
def update(self, data, decayFactor, timeUnit):
    """Update the centroids, according to data

    :param data: Should be a RDD that represents the new data.
    :param decayFactor: forgetfulness of the previous centroids.
    :param timeUnit: Can be "batches" or "points". If points, then the
                     decay factor is raised to the power of number of new
                     points and if batches, it is used as it is.
    :return: this model, with `centers` and `_clusterWeights` replaced.
    """
    if not isinstance(data, RDD):
        raise TypeError("Data should be of an RDD, got %s." % type(data))
    vectors = data.map(_convert_to_vector)
    decay = float(decayFactor)
    if timeUnit not in ("batches", "points"):
        raise ValueError(
            "timeUnit should be 'batches' or 'points', got %s." % timeUnit)

    current_centers = [_convert_to_vector(center) for center in self.centers]
    result = callMLlibFunc(
        "updateStreamingKMeansModel", current_centers, self._clusterWeights,
        vectors, decay, timeUnit)
    # The call returns (centers, weights); store both back on the model.
    self.centers = array(result[0])
    self._clusterWeights = list(result[1])
    return self
def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
    """
    Predict the label of one or more examples.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
        Data point (feature vector), or an RDD of data points (feature
        vectors).

    Notes
    -----
    In Python, predict cannot currently be used within an RDD
    transformation or action.
    Call predict directly on the RDD instead.
    """
    if not isinstance(x, RDD):
        return self.call("predict", _convert_to_vector(x))
    # RDD input: convert each element, then predict on the whole RDD.
    return self.call("predict", x.map(_convert_to_vector))
def update(self, data: RDD["VectorLike"], decayFactor: float, timeUnit: str) -> "StreamingKMeansModel":
    """Update the centroids, according to data

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : :py:class:`pyspark.RDD`
        RDD with new data for the model update.
    decayFactor : float
        Forgetfulness of the previous centroids.
    timeUnit :  str
        Can be "batches" or "points". If points, then the decay factor
        is raised to the power of number of new points and if batches,
        then decay factor will be used as is.

    Returns
    -------
    This model, with `centers` and `_clusterWeights` replaced.
    """
    if not isinstance(data, RDD):
        raise TypeError("Data should be of an RDD, got %s." % type(data))
    data = data.map(_convert_to_vector)
    decayFactor = float(decayFactor)
    if timeUnit not in ("batches", "points"):
        raise ValueError(
            "timeUnit should be 'batches' or 'points', got %s." % timeUnit)

    current_centers = [_convert_to_vector(center) for center in self.centers]
    result = callMLlibFunc(
        "updateStreamingKMeansModel",
        current_centers,
        self._clusterWeights,
        data,
        decayFactor,
        timeUnit,
    )
    # The call returns (centers, weights); store both back on the model.
    self.centers = array(result[0])  # type: ignore[assignment]
    self._clusterWeights = list(result[1])
    return self
def __init__(self, scalingVector):
    """Store `scalingVector`, converted with `_convert_to_vector`."""
    converted = _convert_to_vector(scalingVector)
    self.scalingVector = converted
def save(self, sc, path):
    """
    Save this model to `path`.

    The centers are converted to vectors, handed to a JVM-side
    KMeansModel, and that object's `save` is called.
    """
    center_vectors = [_convert_to_vector(center) for center in self.centers]
    jvm_model_class = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel
    jvm_model = jvm_model_class(_py2java(sc, center_vectors))
    jvm_model.save(sc._jsc.sc(), path)
def train(cls, rdd, k, maxIterations=100, initializationMode="k-means||",
          seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
    """
    Train a k-means clustering model.

    .. versionadded:: 0.9.0

    Parameters
    ----------
    rdd : ::py:class:`pyspark.RDD`
        Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
        or convertible sequence types.
    k : int
        Number of clusters to create.
    maxIterations : int, optional
        Maximum number of iterations allowed.
        (default: 100)
    initializationMode : str, optional
        The initialization algorithm. This can be either "random" or
        "k-means||".
        (default: "k-means||")
    seed : int, optional
        Random seed value for cluster initialization. Set as None to
        generate seed based on system time.
        (default: None)
    initializationSteps : int, optional
        Number of steps for the k-means|| initialization mode.
        This is an advanced setting -- the default of 2 is almost
        always enough.
        (default: 2)
    epsilon : float, optional
        Distance threshold within which a center will be considered to
        have converged. If all centers move less than this Euclidean
        distance, iterations are stopped.
        (default: 1e-4)
    initialModel : :py:class:`KMeansModel`, optional
        Initial cluster centers can be provided as a KMeansModel object
        rather than using the random or k-means|| initializationModel.
        (default: None)
    """
    if initialModel is None:
        initial_centers = []
    elif isinstance(initialModel, KMeansModel):
        initial_centers = [
            _convert_to_vector(center) for center in initialModel.clusterCenters
        ]
    else:
        raise TypeError("initialModel is of " + str(type(initialModel)) + ". It needs "
                        "to be of <type 'KMeansModel'>")

    points = rdd.map(_convert_to_vector)
    java_model = callMLlibFunc("trainKMeansModel", points, k, maxIterations,
                               initializationMode, seed, initializationSteps, epsilon,
                               initial_centers)
    trained_centers = callJavaFunc(rdd.context, java_model.clusterCenters)
    return KMeansModel([center.toArray() for center in trained_centers])
def __init__(self, weights, intercept):
    """
    Store the weights as a vector in `_coeff` and the intercept as a
    float in `_intercept`.
    """
    coefficients = _convert_to_vector(weights)
    bias = float(intercept)
    self._coeff = coefficients
    self._intercept = bias
def chiSqTest(observed, expected=None):
    """
    .. note:: Experimental

    If `observed` is Vector, conduct Pearson's chi-squared goodness
    of fit test of the observed data against the expected distribution,
    or against the uniform distribution (by default), with each category
    having an expected frequency of `1 / len(observed)`.
    (Note: `observed` cannot contain negative values)

    If `observed` is matrix, conduct Pearson's independence test on the
    input contingency matrix, which cannot contain negative entries or
    columns or rows that sum up to 0.

    If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
    test for every feature against the label across the input RDD.
    For each feature, the (feature, label) pairs are converted into a
    contingency matrix for which the chi-squared statistic is computed.
    All label and feature values must be categorical.

    :param observed: it could be a vector containing the observed categorical
                     counts/relative frequencies, or the contingency matrix
                     (containing either counts or relative frequencies),
                     or an RDD of LabeledPoint containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
    :param expected: Vector containing the expected categorical counts/relative
                     frequencies. `expected` is rescaled if the `expected` sum
                     differs from the `observed` sum.
    :return: ChiSquaredTest object containing the test statistic, degrees
             of freedom, p-value, the method used, and the null hypothesis.
    :raises ValueError: if `observed` is an RDD whose first element is not
                        a LabeledPoint, or if a non-empty `expected` has a
                        different length than `observed`.

    >>> from pyspark.mllib.linalg import Vectors, Matrices
    >>> observed = Vectors.dense([4, 6, 5])
    >>> pearson = Statistics.chiSqTest(observed)
    >>> print pearson.statistic
    0.4
    >>> pearson.degreesOfFreedom
    2
    >>> print round(pearson.pValue, 4)
    0.8187
    >>> pearson.method
    u'pearson'
    >>> pearson.nullHypothesis
    u'observed follows the same distribution as expected.'

    >>> observed = Vectors.dense([21, 38, 43, 80])
    >>> expected = Vectors.dense([3, 5, 7, 20])
    >>> pearson = Statistics.chiSqTest(observed, expected)
    >>> print round(pearson.pValue, 4)
    0.0027

    >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    >>> print round(chi.statistic, 4)
    21.9958

    >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
    ...         LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
    ...         LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
    ...         LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
    ...         LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
    ...         LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
    >>> rdd = sc.parallelize(data, 4)
    >>> chi = Statistics.chiSqTest(rdd)
    >>> print chi[0].statistic
    0.75
    >>> print chi[1].statistic
    1.5
    """
    if isinstance(observed, RDD):
        # RDD input: one test result per element returned by the MLlib call.
        if not isinstance(observed.first(), LabeledPoint):
            raise ValueError("observed should be an RDD of LabeledPoint")
        jmodels = callMLlibFunc("chiSqTest", observed)
        return [ChiSqTestResult(m) for m in jmodels]

    if isinstance(observed, Matrix):
        jmodel = callMLlibFunc("chiSqTest", observed)
    else:
        # Compare against None and use len() explicitly: the truth value of
        # a multi-element numpy array is ambiguous and raises ValueError.
        # An empty `expected` still skips the length check.
        if (expected is not None and len(expected) > 0
                and len(expected) != len(observed)):
            raise ValueError("`expected` should have same length with `observed`")
        jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
    return ChiSqTestResult(jmodel)
def __init__(self, label, features):
    """
    Store the label as a float and the features as a vector
    (converted with `_convert_to_vector`).
    """
    numeric_label = float(label)
    feature_vector = _convert_to_vector(features)
    self.label = numeric_label
    self.features = feature_vector
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
          seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
    """
    Train a k-means clustering model.

    :param rdd:
      Training points as an `RDD` of `Vector` or convertible
      sequence types.
    :param k:
      Number of clusters to create.
    :param maxIterations:
      Maximum number of iterations allowed.
      (default: 100)
    :param runs:
      This param has no effect since Spark 2.0.0.
    :param initializationMode:
      The initialization algorithm. This can be either "random" or
      "k-means||".
      (default: "k-means||")
    :param seed:
      Random seed value for cluster initialization. Set as None to
      generate seed based on system time.
      (default: None)
    :param initializationSteps:
      Number of steps for the k-means|| initialization mode.
      This is an advanced setting -- the default of 2 is almost
      always enough.
      (default: 2)
    :param epsilon:
      Distance threshold within which a center will be considered to
      have converged. If all centers move less than this Euclidean
      distance, iterations are stopped.
      (default: 1e-4)
    :param initialModel:
      Initial cluster centers can be provided as a KMeansModel object
      rather than using the random or k-means|| initializationModel.
      (default: None)
    :return: a KMeansModel built from the trained centers, as arrays.
    :raises TypeError: if `initialModel` is given but is not a KMeansModel.
    """
    if runs != 1:
        warnings.warn("The param `runs` has no effect since Spark 2.0.0.")
    clusterInitialModel = []
    if initialModel is not None:
        if not isinstance(initialModel, KMeansModel):
            # TypeError is the precise error for a wrong-type argument and,
            # being an Exception subclass, is still caught by existing
            # `except Exception` handlers.
            raise TypeError("initialModel is of " + str(type(initialModel)) + ". It needs "
                            "to be of <type 'KMeansModel'>")
        clusterInitialModel = [
            _convert_to_vector(c) for c in initialModel.clusterCenters
        ]
    model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations,
                          runs, initializationMode, seed, initializationSteps, epsilon,
                          clusterInitialModel)
    centers = callJavaFunc(rdd.context, model.clusterCenters)
    return KMeansModel([c.toArray() for c in centers])
def transform(self, vector):
    """
    Apply the wrapped model's "transform" to a vector or an RDD.

    A single input is converted with `_convert_to_vector`; for an RDD
    the conversion is mapped over its elements.
    """
    if not isinstance(vector, RDD):
        return self.call("transform", _convert_to_vector(vector))
    return self.call("transform", vector.map(_convert_to_vector))
def __init__(self, index, vector):
    """
    Store `index` cast with `long` and `vector` converted with
    `_convert_to_vector`.
    """
    row_index = long(index)
    row_vector = _convert_to_vector(vector)
    self.index = row_index
    self.vector = row_vector
def serialize(l):
    """Convert `l` to a vector and round-trip it through `ser`."""
    payload = ser.dumps(_convert_to_vector(l))
    return ser.loads(payload)
def predict(self, x):
    """Return the most likely class for a data vector x"""
    features = _convert_to_vector(x)
    # One score per label: `pi` plus the dot product with `theta` transposed.
    scores = self.pi + features.dot(self.theta.transpose())
    winner = numpy.argmax(scores)
    return self.labels[winner]
def predict(self, x):
    """
    Return 1 if the margin (weights dot `x`, plus the intercept) is
    non-negative, otherwise 0.
    """
    features = _convert_to_vector(x)
    margin = self.weights.dot(features) + self.intercept
    if margin >= 0:
        return 1
    return 0