def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points saved using RDD.saveAsTextFile.

    @param sc: Spark context
    @param path: file or directory path in any Hadoop-supported file system URI
    @param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    minPartitions = minPartitions or min(sc.defaultParallelism, 2)
    jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
        sc._jsc, path, minPartitions)
    serialized = RDD(jSerialized, sc, NoOpSerializer())
    return serialized.map(
        lambda bytes: _deserialize_labeled_point(bytearray(bytes)))

def calculate_value_to_index(self, rdd: RDD):
    column_name = self.column
    if isinstance(column_name, str):
        column = rdd.map(lambda x: getattr(x, column_name))
    else:
        column = rdd.map(lambda x: x[column_name])
    self._log.info("Collecting the list of distinct sorted values (%s)",
                   column_name)
    values = column.distinct()
    if self.explained:
        self._log.info("toDebugString():\n%s", values.toDebugString().decode())
    values = values.collect()
    values.sort()  # We do not expect an extraordinary number of distinct values
    self._log.info("%d distinct values", len(values))
    if len(values) == 0:
        raise RuntimeError("Number of distinct values is zero.")
    self._value_to_index = {d: i for i, d in enumerate(values)}

def __call__(self, rdd: RDD):
    column_name = self.column
    if self._value_to_index is None:
        self.calculate_value_to_index(rdd)
    column2id = rdd.context.broadcast(self._value_to_index)

    def index_column(row):
        """
        Replace the value in the indexed column with its index from column2id.

        WARNING: per the PySpark documentation
        (http://spark.apache.org/docs/latest/rdd-programming-guide.html#passing-functions-to-spark)
        do not use `self` inside this function. It will be suboptimal and will
        probably fail to run.
        Please contact me if you have trouble: [email protected]
        """
        if isinstance(column_name, str):
            assert isinstance(row, Row)
            row_dict = row.asDict()
            row_dict[column_name] = column2id.value[row_dict[column_name]]
            return Row(**row_dict)
        return row[:column_name] + \
            (column2id.value[row[column_name]],) + \
            row[column_name + 1:]

    indexed_rdd = rdd.map(index_column)
    column2id.unpersist(blocking=True)
    return indexed_rdd

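# Usage sketch for the two indexer methods above, assuming they belong to an
# Indexer-like class whose constructor takes the column to index; the class
# definition is not shown in this file, so the constructor and the data are
# illustrative assumptions.
def _indexer_usage_sketch(sc, Indexer):
    from pyspark.sql import Row
    rows = sc.parallelize([
        Row(lang="Python", repo="a"),
        Row(lang="Go", repo="b"),
        Row(lang="Python", repo="c"),
    ])
    indexer = Indexer(column="lang")  # hypothetical constructor
    # Distinct values are collected, sorted and enumerated, so "Go" -> 0
    # and "Python" -> 1.
    return indexer(rows).collect()
    # e.g. [Row(lang=1, repo='a'), Row(lang=0, repo='b'), Row(lang=1, repo='c')]
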
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the Poisson distribution with the input mean.

    >>> import numpy as np
    >>> mean = 100.0
    >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI() \
        .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
    poisson = RDD(jrdd, sc, NoOpSerializer())
    return poisson.map(
        lambda bytes: _deserialize_double_vector(bytearray(bytes)))

def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the
    uniform distribution U(0.0, 1.0).

    To transform the distribution in the generated RDD from U(0.0, 1.0)
    to U(a, b), use
    C{RandomRDDs.uniformRDD(sc, n, p, seed)\
      .map(lambda v: a + (b - a) * v)}

    >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed)
    uniform = RDD(jrdd, sc, NoOpSerializer())
    return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def rank_using_reverted_index(index: RDD):
    """
    Rank the keys of an inverted index by how many items they map to.

    :param index: pair RDD mapping a key (e.g. a language) to a collection of
        items (e.g. articles)
    :return: list of (key, count) pairs sorted by count in descending order
    """
    return index.map(lambda lang_article: (lang_article[0], len(lang_article[1]))) \
        .sortBy(lambda k: k[1], ascending=False) \
        .collect()

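# Usage sketch for rank_using_reverted_index with a toy inverted index
# (language -> article ids); the data is illustrative.
def _rank_usage_sketch(sc):
    index = sc.parallelize([
        ("en", ["a1", "a2", "a3"]),
        ("de", ["a1"]),
        ("fr", ["a2", "a4"]),
    ])
    return rank_using_reverted_index(index)
    # [('en', 3), ('fr', 2), ('de', 1)]
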
def summary_cluster(cluster_result_: RDD) -> dict:
    """
    Get the summary information of the discard set / compression set.

    :param cluster_result_: pair RDD, key is point_id, value is
        (cluster_id, feature vector)
    :return: a dictionary containing the summary information
    """
    # Per cluster, accumulate (sum, sum of squares) for every feature index,
    # order the results by feature index, and keep only the per-feature sums.
    cluster_sum = cluster_result_. \
        values(). \
        flatMapValues(lambda features: [(feature_index, (feature, feature ** 2))
                                        for feature_index, feature in enumerate(features)]). \
        map(lambda pair: ((pair[0], pair[1][0]), pair[1][1])). \
        reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
        map(lambda pair: (pair[0][0], (pair[0][1], (pair[1][0], pair[1][1])))). \
        groupByKey(). \
        mapValues(sorted). \
        mapValues(lambda values: [value for feature_index, value in values]). \
        mapValues(lambda values: [value for value, _ in values]). \
        persist()
    cluster_sum = dict(cluster_sum.collect())
    # cluster_id -> set of assigned point_ids.
    cluster_assignments = dict(
        cluster_result_.map(lambda pair: (pair[1][0], pair[0])).
        groupByKey().mapValues(set).collect())
    cluster_point_index = set(cluster_result_.keys().collect())
    # cluster_id -> number of assigned points.
    cluster_count = dict(
        cluster_result_.map(lambda pair: (pair[1][0], 1)).reduceByKey(
            lambda x, y: x + y).collect())
    # Centroid of each cluster: per-feature sum divided by the cluster size.
    cluster_centroids = dict()
    for cid in cluster_sum.keys():
        cluster_size = cluster_count[cid]
        cluster_centroids[cid] = [
            Sum / cluster_size for Sum in cluster_sum[cid]
        ]
    return {
        "assignments": cluster_assignments,
        "point_index": cluster_point_index,
        "count": cluster_count,
        "sum": cluster_sum,
        "centroids": cluster_centroids
    }

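# Usage sketch for summary_cluster: the input pairs point_id with
# (cluster_id, feature vector), as the docstring describes; the data is toy.
def _summary_cluster_usage_sketch(sc):
    assignments = sc.parallelize([
        ("p1", (0, [1.0, 2.0])),
        ("p2", (0, [3.0, 4.0])),
        ("p3", (1, [5.0, 6.0])),
    ])
    summary = summary_cluster(assignments)
    # summary["count"]     -> {0: 2, 1: 1}
    # summary["sum"]       -> {0: [4.0, 6.0], 1: [5.0, 6.0]}
    # summary["centroids"] -> {0: [2.0, 3.0], 1: [5.0, 6.0]}
    return summary
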
def log_likelihood(self, rdd: RDD) -> float:
    """Returns the log-likelihood that this model generated the given data
    distribution.

    Parameters
    ----------
    rdd : RDD
        Elements are assumed to be floats.

    Returns
    -------
    float
        The log-likelihood that this model generated the given data
        distribution.
    """
    return rdd.map(lambda x: math.log(self.pdf(x))).sum()  # action

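# Minimal sketch of a model that log_likelihood above could be attached to:
# the method only needs a pdf(x) defined on self. _GaussianModelSketch is an
# assumption for illustration -- only log_likelihood itself appears in this
# file.
import math

class _GaussianModelSketch:
    def __init__(self, mean=0.0, std=1.0):
        self.mean, self.std = mean, std

    def pdf(self, x):
        # Density of N(mean, std^2) at x.
        z = (x - self.mean) / self.std
        return math.exp(-0.5 * z * z) / (self.std * math.sqrt(2.0 * math.pi))

    log_likelihood = log_likelihood  # reuse the function defined above

# _GaussianModelSketch().log_likelihood(sc.parallelize([0.0, 0.5, -0.5]))
# evaluates log(pdf(x)) for every element and sums them in one Spark action.
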
def fit(self, data: RDD):
    # data.cache()
    # Initialize the cluster centers.
    clusters = self._init_clusters(data)
    for i in range(self.max_iter):
        # Compute the distance from each point to every cluster center.
        dists = data.map(lambda x: (x, [cal_distance(x, c) for c in clusters]))
        # Find the nearest cluster center for each point.
        tmp = dists.map(lambda x: (np.argmin(x[1]), x[0]))
        # New cluster centers: the mean of the points assigned to each center.
        clusters = tmp.mapValues(lambda x: (x, 1)).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1] + y[1])).map(
            lambda x: x[1][0] / x[1][1]).collect()
    self.clusters = clusters
    return self

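# fit() relies on a cal_distance helper and an _init_clusters method that are
# not shown in this file; below is a plausible squared-Euclidean stand-in for
# the distance helper, an assumption rather than the original implementation.
import numpy as np

def cal_distance(x, center):
    # Squared Euclidean distance between a point and a cluster center.
    x, center = np.asarray(x, dtype=float), np.asarray(center, dtype=float)
    return float(np.sum((x - center) ** 2))
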
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the standard normal distribution.

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - 0.0) < 0.1
    True
    >>> abs(mat.std() - 1.0) < 0.1
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI() \
        .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    normal = RDD(jrdd, sc, NoOpSerializer())
    return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))

def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the uniform distribution U(0.0, 1.0).

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
    >>> mat.shape
    (10, 10)
    >>> mat.max() <= 1.0 and mat.min() >= 0.0
    True
    >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
    4
    """
    jrdd = sc._jvm.PythonMLLibAPI() \
        .uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    uniform = RDD(jrdd, sc, NoOpSerializer())
    return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))

def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    .. note:: Experimental

    Create an RDD from Kafka using offset ranges for each topic and partition.

    :param sc: SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty
        map, in which case leaders will be looked up on the driver.
    :param keyDecoder: A function used to decode key (default is utf8_decoder)
    :param valueDecoder: A function used to decode value (default is utf8_decoder)
    :return: An RDD object
    """
    if leaders is None:
        leaders = dict()
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")

    try:
        helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
        helper = helperClass.newInstance()
        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        jleaders = dict([(k._jTopicAndPartition(helper), v._jBroker(helper))
                         for (k, v) in leaders.items()])
        jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        raise e

    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    rdd = RDD(jrdd, sc, ser)
    return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))

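# Usage sketch for createRDD, assuming a Kafka 0.8 broker at localhost:9092
# with a topic named "test"; the broker address, topic, and offsets are
# illustrative.
def _kafka_create_rdd_sketch(sc):
    from pyspark.streaming.kafka import KafkaUtils, OffsetRange
    # Consume messages [0, 100) from partition 0 of topic "test".
    offset_ranges = [OffsetRange("test", 0, 0, 100)]
    kafka_params = {"metadata.broker.list": "localhost:9092"}
    rdd = KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
    return rdd.take(5)  # (key, value) pairs decoded with utf8_decoder
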
def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the Poisson
    distribution with the input mean.

    >>> mean = 100.0
    >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - sqrt(mean)) < 0.5
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
    poisson = RDD(jrdd, sc, NoOpSerializer())
    return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the Poisson
    distribution with the input mean.

    >>> mean = 100.0
    >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - sqrt(mean)) < 0.5
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
    poisson = RDD(jrdd, sc, NoOpSerializer())
    return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma^2), use
    C{RandomRDDGenerators.normal(sc, n, p, seed)\
      .map(lambda v: mean + sigma * v)}

    >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
    normal = RDD(jrdd, sc, NoOpSerializer())
    return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma^2), use
    C{RandomRDDs.normal(sc, n, p, seed)\
      .map(lambda v: mean + sigma * v)}

    >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
    normal = RDD(jrdd, sc, NoOpSerializer())
    return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    .. note:: Experimental

    Create an RDD from Kafka using offset ranges for each topic and partition.

    :param sc: SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty
        map, in which case leaders will be looked up on the driver.
    :param keyDecoder: A function used to decode key (default is utf8_decoder)
    :param valueDecoder: A function used to decode value (default is utf8_decoder)
    :return: An RDD object
    """
    # Avoid a mutable default argument: fall back to an empty leaders map.
    if leaders is None:
        leaders = {}
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")

    try:
        helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
        helper = helperClass.newInstance()
        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        jleaders = dict([(k._jTopicAndPartition(helper), v._jBroker(helper))
                         for (k, v) in leaders.items()])
        jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        raise e

    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    rdd = RDD(jrdd, sc, ser)
    return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))

def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the
    uniform distribution on [0.0, 1.0].

    To transform the distribution in the generated RDD from U[0.0, 1.0]
    to U[a, b], use
    C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
      .map(lambda v: a + (b - a) * v)}

    >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed)
    uniform = RDD(jrdd, sc, NoOpSerializer())
    return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))

def predict(self, data: RDD):
    # Assign each point to the index of its nearest cluster center.
    dists = data.map(lambda x: (x, [cal_distance(x, c) for c in self.clusters]))
    tmp = dists.map(lambda x: np.argmin(x[1]))
    return tmp.collect()

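# End-to-end sketch for fit()/predict() above, assuming they are methods of a
# KMeans-like class; the class name, constructor, and _init_clusters are
# assumptions, since only the two methods appear in this file.
def _kmeans_usage_sketch(sc, KMeans):
    import numpy as np
    data = sc.parallelize([
        np.array([0.0, 0.0]), np.array([0.1, 0.2]),
        np.array([9.0, 9.0]), np.array([9.1, 8.8]),
    ])
    model = KMeans(k=2, max_iter=10)  # hypothetical constructor
    model.fit(data)
    return model.predict(data)  # e.g. [0, 0, 1, 1]; cluster ids may vary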