def uniformRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the uniform distribution U(0.0, 1.0). To transform the distribution in the generated RDD from U(0.0, 1.0) to U(a, b), use C{RandomRDDs.uniformRDD(sc, n, p, seed)\ .map(lambda v: a + (b - a) * v)} >>> x = RandomRDDs.uniformRDD(sc, 100).collect() >>> len(x) 100 >>> max(x) <= 1.0 and min(x) >= 0.0 True >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions() 4 >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions() >>> parts == sc.defaultParallelism True """ jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed) uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
def poissonRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Poisson distribution with the input mean. >>> mean = 100.0 >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L) >>> stats = x.stats() >>> stats.count() 1000L >>> abs(stats.mean() - mean) < 0.5 True >>> from math import sqrt >>> abs(stats.stdev() - sqrt(mean)) < 0.5 True """ jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed) poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
def poissonRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d samples from the Poisson distribution with the input mean. >>> mean = 100.0 >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L) >>> stats = x.stats() >>> stats.count() 1000L >>> abs(stats.mean() - mean) < 0.5 True >>> from math import sqrt >>> abs(stats.stdev() - sqrt(mean)) < 0.5 True """ jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed) poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
def predict(self, x): """ Predict the label of one or more examples. :param x: Data point (feature vector), or an RDD of data points (feature vectors). """ pythonAPI = self._sc._jvm.PythonMLLibAPI() if isinstance(x, RDD): # Bulk prediction if x.count() == 0: return self._sc.parallelize([]) dataBytes = _get_unmangled_double_vector_rdd(x, cache=False) jSerializedPreds = \ pythonAPI.predictDecisionTreeModel(self._java_model, dataBytes._jrdd) serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer()) return serializedPreds.map(lambda bytes: _deserialize_double(bytearray(bytes))) else: # Assume x is a single data point. x_ = _serialize_double_vector(x) return pythonAPI.predictDecisionTreeModel(self._java_model, x_)
def normalRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d samples from the standard normal distribution. To transform the distribution in the generated RDD from standard normal to some other normal N(mean, sigma), use C{RandomRDDGenerators.normal(sc, n, p, seed)\ .map(lambda v: mean + sigma * v)} >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L) >>> stats = x.stats() >>> stats.count() 1000L >>> abs(stats.mean() - 0.0) < 0.1 True >>> abs(stats.stdev() - 1.0) < 0.1 True """ jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed) normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
def normalRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the standard normal distribution. To transform the distribution in the generated RDD from standard normal to some other normal N(mean, sigma^2), use C{RandomRDDs.normal(sc, n, p, seed)\ .map(lambda v: mean + sigma * v)} >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L) >>> stats = x.stats() >>> stats.count() 1000L >>> abs(stats.mean() - 0.0) < 0.1 True >>> abs(stats.stdev() - 1.0) < 0.1 True """ jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed) normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
def uniformRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the uniform distribution on [0.0, 1.0]. To transform the distribution in the generated RDD from U[0.0, 1.0] to U[a, b], use C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\ .map(lambda v: a + (b - a) * v)} >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect() >>> len(x) 100 >>> max(x) <= 1.0 and min(x) >= 0.0 True >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions() 4 >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions() >>> parts == sc.defaultParallelism True """ jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed) uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))