def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, as a
    dict of numpy arrays if only numpy is available, or as a dict of
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
        indexed by the given columns.
    '''
    for c in index_by:
        if c in self.columns:
            raise ValueError(
                'column %s cannot be used as index in the dataframes '
                'as it is a column by which the rows are spanned.' % c)

    # Span the rows by the configured columns on the JVM side.
    columns = as_java_array(self.ctx._gateway, "String",
                            (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._crdd, columns)
    rdd = RDD(jrdd, self.ctx)

    # If pandas is available, index each DataFrame by the requested columns.
    # (set_index takes a list of keys; unpacking them as positional
    # arguments would misroute the second column into the drop parameter.)
    global pd
    if index_by and pd:
        return rdd.mapValues(
            lambda df: df.set_index([str(c) for c in index_by]))
    return rdd
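# A minimal usage sketch for asDataFrames. Assumptions: a pyspark-cassandra-style
# setup where the SparkContext `sc` has been extended with cassandraTable(); the
# keyspace, table, and column names below are hypothetical:
#
#   spanned = sc.cassandraTable("ks", "events").spanBy("user_id")
#   frames = spanned.asDataFrames("timestamp")
#   # frames is a pair RDD: key = the user_id value spanned on, value = a
#   # pandas DataFrame of that user's rows, indexed by "timestamp".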
from pyspark import RDD


def kmeans(data: RDD, num_clusters: int, min_cluster_id=0, max_iterations=20) -> RDD:
    """
    :param data: pair RDD, key is point_id, value is feature vector
    :param num_clusters: number of clusters
    :param min_cluster_id: the id of each cluster is min_cluster_id,
        min_cluster_id + 1, min_cluster_id + 2, ...
    :param max_iterations: maximum number of iterations
    :return: clustering result, pair RDD, key is point_id,
        value is (cluster_id, feature vector)
    """
    # ** some helper functions:
    def minimum_distance_to_centroids(point: list, centroids: list) -> float:
        """Squared Euclidean distance from a point to its closest centroid."""
        return min(sum((i - j) ** 2 for i, j in zip(point, centroid))
                   for centroid in centroids)

    def assign_point_to_cluster(point: list, centroids_: dict) -> int:
        """Given a data point, find the id of the nearest centroid."""
        distances = [(sum((i - j) ** 2 for i, j in zip(point, centroid)), cid)
                     for cid, centroid in centroids_.items()]
        min_distance, cluster_index = min(distances)
        return cluster_index

    # ** initialize the first k centroids
    total_samples = data.count()
    print("initializing centroids")
    print("total samples:", total_samples)
    sample_fraction = 0.3
    if total_samples * sample_fraction < num_clusters:
        # Too few points to sample from; take points directly as centroids.
        if total_samples < num_clusters:
            centroids = data.values().collect()
        else:
            centroids = data.values().take(num_clusters)
    else:
        # Farthest-point heuristic on a sample: start from one point, then
        # repeatedly add the point farthest from the centroids chosen so far.
        sample_data = data.sample(withReplacement=False,
                                  fraction=sample_fraction, seed=10)
        centroids = [sample_data.first()[1]]  # first centroid
        for _ in range(num_clusters - 1):  # find the next k-1 centroids
            distances = sample_data.mapValues(
                lambda values: minimum_distance_to_centroids(values, centroids)).persist()
            furthest_point = sample_data.lookup(
                distances.max(lambda pair: pair[1])[0])[0]
            centroids.append(furthest_point)
    # assign an id to each centroid
    centroids = {min_cluster_id + i: centroid
                 for i, centroid in enumerate(centroids)}

    # ** perform clustering
    num_iterations = 0
    previous_cluster_sizes = dict()
    while num_iterations < max_iterations:
        print("current iteration:", num_iterations)
        print("assign points to clusters")
        cluster_result: RDD = data.mapValues(
            lambda values: (assign_point_to_cluster(values, centroids), values)).persist()
        cluster_sizes = cluster_result.map(
            lambda pair: (pair[1][0], 1)).reduceByKey(lambda x, y: x + y)
        cluster_sizes = dict(cluster_sizes.collect())
        if num_iterations > 0:  # after the first iteration, check for convergence
            num_reassignments = [
                cluster_sizes[cid] - previous_cluster_sizes.get(cid, 0)
                for cid in cluster_sizes.keys()]
            print("num_reassignments:", num_reassignments)
            if max(num_reassignments) - min(num_reassignments) < 3:
                break  # cluster sizes have stabilized; treat as converged
        previous_cluster_sizes = cluster_sizes.copy()

        print("update centroids:")
        # New centroid = per-cluster, per-feature mean:
        # key each feature value by (cluster_id, feature_index), sum values
        # and counts, divide, then regroup the features of each cluster.
        new_centroids = cluster_result. \
            values(). \
            flatMapValues(lambda features: list(enumerate(features))). \
            map(lambda pair: ((pair[0], pair[1][0]), (pair[1][1], 1))). \
            reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
            mapValues(lambda values: values[0] / values[1]). \
            map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))). \
            groupByKey(). \
            mapValues(sorted). \
            mapValues(lambda values: [value for feature_index, value in values]). \
            persist()
        centroids = dict(new_centroids.collect())
        num_iterations += 1

    print("Finished. Total iterations:", num_iterations)
    print("cluster size",
          cluster_result.map(lambda pair: (pair[1][0], 1))
          .reduceByKey(lambda x, y: x + y).collect())
    return cluster_result
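# A minimal usage sketch for kmeans. Assumptions: a local Spark install; the
# point ids and feature vectors below are toy data made up for illustration
# and are not part of the original function:
if __name__ == "__main__":
    from pyspark import SparkContext

    sc = SparkContext("local[2]", "kmeans-demo")
    points = sc.parallelize([
        (0, [0.0, 0.1]), (1, [0.2, 0.0]),    # points near the origin
        (2, [9.9, 10.1]), (3, [10.0, 9.8]),  # points near (10, 10)
    ])
    result = kmeans(points, num_clusters=2)
    # Each element is (point_id, (cluster_id, feature_vector)).
    print(sorted(result.collect()))
    sc.stop()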