Example 1
    def asDataFrames(self, *index_by):
        '''
            Reads the spanned rows as DataFrames if pandas is available, as a
            dict of numpy arrays if only numpy is available, or as a dict of
            primitives and objects otherwise.

            @param index_by If pandas is available, the dataframes will be
            indexed by the given columns.
        '''
        for c in index_by:
            if c in self.columns:
                raise ValueError(
                    'column %s cannot be used as index in the data'
                    'frames as it is a column by which the rows are spanned.' % c)

        columns = as_java_array(self.ctx._gateway, "String",
                                (str(c) for c in self.columns))
        jrdd = self._helper.spanBy(self._crdd, columns)
        rdd = RDD(jrdd, self.ctx)

        global pd
        if index_by and pd:
            return rdd.mapValues(
                lambda _: _.set_index(*[str(c) for c in index_by]))
        else:
            return rdd
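
A minimal usage sketch, assuming a hypothetical object named spanned that exposes the asDataFrames method above (for example, the result of spanning a table RDD by a key column); every name below is a placeholder, not part of the snippet:

    # Hypothetical usage: spanned is assumed to expose asDataFrames() as defined above.
    frames = spanned.asDataFrames("timestamp")  # pair RDD: span key -> pandas DataFrame
    key, df = frames.first()                    # with pandas installed, df is a DataFrame
    print(df.head())                            # indexed by the "timestamp" column
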
Example 2
	def asDataFrames(self, *index_by):
		'''
			Reads the spanned rows as DataFrames if pandas is available, as a
			dict of numpy arrays if only numpy is available, or as a dict of
			primitives and objects otherwise.
			
			@param index_by If pandas is available, the dataframes will be
			indexed by the given columns.
		'''
		for c in index_by:
			if c in self.columns:
				raise ValueError('column %s cannot be used as index in the data'
					'frames as it is a column by which the rows are spanned.' % c)
		
		columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
		jrdd = self._helper.spanBy(self._cjrdd, columns)
		rdd = RDD(jrdd, self.ctx)
		
		global pd
		if index_by and pd:
			return rdd.mapValues(lambda _: _.set_index(*[str(c) for c in index_by]))
		else:
			return rdd
Example 3
    def kmeans(data: RDD,
               num_clusters: int,
               min_cluster_id=0,
               max_iterations=20) -> RDD:
        """
        :param data: pair RDD, key is point_id, value is feature vector
        :param num_clusters: number of clusters
        :param min_cluster_id: cluster ids are assigned consecutively as min_cluster_id, min_cluster_id + 1, min_cluster_id + 2, ...
        :param max_iterations: maximum number of iterations
        :return: clustering result, pair RDD, key is point_id, value is (cluster id, feature vector)
        """

        # ** some helper functions:
        def minimum_distance_to_centroids(point: list,
                                          centroids: list) -> float:
            "Squared Euclidean distance from a point to its nearest centroid"
            min_distance = min([
                sum([(i - j)**2 for i, j in zip(point, centroid)])
                for centroid in centroids
            ])
            return min_distance

        def assign_point_to_cluster(point: list, centroids_: dict) -> int:
            "Given a data point, find the nearest centroid to it"
            distances = [(sum([(i - j)**2
                               for i, j in zip(point, centroid)]), cid)
                         for cid, centroid in centroids_.items()]
            min_distance, cluster_index = min(distances)
            return cluster_index

        # ** initialize the first k centroids
        total_samples = data.count()
        print("initializing centroids")
        print("total samples:", total_samples)
        sample_fraction = 0.3
        if total_samples * sample_fraction < num_clusters:
            if total_samples < num_clusters:
                centroids = data.values().collect()
            else:
                centroids = data.values().take(num_clusters)
        else:
            sample_data = data.sample(withReplacement=False,
                                      fraction=sample_fraction,
                                      seed=10)
            centroids = [sample_data.first()[1]]  # add first centroid to the collection of centroids
            for _ in range(num_clusters - 1):  # find the next k-1 centroids
                distances = sample_data.\
                    mapValues(lambda values: minimum_distance_to_centroids(values, centroids)).persist()
                furthest_point = sample_data.lookup(
                    distances.max(lambda pair: pair[1])[0])[0]
                centroids.append(
                    furthest_point)  # furthest from already selected points

        centroids = [(min_cluster_id + i, centroid)
                     for i, centroid in enumerate(centroids)]  # assign index
        centroids = dict(centroids)
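        # Note: the sampling branch above seeds centroids farthest-first: each new
        # centroid is the sampled point with the largest squared distance to the
        # centroids already chosen (a deterministic, k-means++-style initialization).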

        # ** perform clustering
        num_iterations = 0
        previous_cluster_sizes = dict()
        while num_iterations < max_iterations:
            print("current iteration:", num_iterations)
            print("assign points to clusters")
            cluster_result: RDD = data.mapValues(lambda values: (
                assign_point_to_cluster(values, centroids), values)).persist()

            cluster_sizes = cluster_result.map(
                lambda pair: (pair[1][0], 1)).reduceByKey(lambda x, y: x + y)
            cluster_sizes = dict(cluster_sizes.collect())
            if num_iterations > 0:  # after the first iteration
                # approximate reassignment activity by the change in cluster
                # sizes between consecutive iterations
                num_reassignments = [
                    cluster_sizes[cid] - previous_cluster_sizes.get(cid, 0)
                    for cid in cluster_sizes.keys()
                ]
                print("num_reassignments:", num_reassignments)
                if max(num_reassignments) - min(num_reassignments) < 3:
                    # cluster sizes have stabilized; treat the process as converged
                    break

            previous_cluster_sizes = cluster_sizes.copy()
            print("update centroids_:")
            new_centroids = cluster_result. \
                values(). \
                flatMapValues(lambda features: [(i, feature) for i, feature in enumerate(features)]). \
                map(lambda pair: ((pair[0], pair[1][0]), (pair[1][1], 1))). \
                reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
                mapValues(lambda values: values[0] / values[1]). \
                map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))). \
                groupByKey(). \
                mapValues(sorted). \
                mapValues(lambda values: [value for feature_index, value in values]). \
                persist()
            new_centroids = dict(new_centroids.collect())
            centroids = new_centroids.copy()
            num_iterations += 1

        print("Converged. Total iterations:", num_iterations)
        print(
            "cluster size",
            cluster_result.values().mapValues(
                lambda values: 1).groupByKey().mapValues(len).collect())
        return cluster_result
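
A minimal driver sketch for the function above, assuming kmeans is reachable as a plain function and that a local SparkContext can be created; the sample points below are made up for illustration:

    # Hypothetical driver: the SparkContext setup and the data are illustrative only.
    from pyspark import SparkContext

    sc = SparkContext("local[*]", "kmeans-demo")
    points = sc.parallelize([
        (0, [0.0, 0.1]), (1, [0.2, 0.0]),    # a tight group near the origin
        (2, [5.0, 5.1]), (3, [4.9, 5.2]),    # a second, well-separated group
    ])
    result = kmeans(points, num_clusters=2, min_cluster_id=0, max_iterations=10)
    print(result.collect())  # [(point_id, (cluster_id, feature_vector)), ...]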