def getAllFeaturesRDD(groupedDataRDD):
    """Collect every distinct feature occurring in a grouped dataset.

    Input:
    - groupedDataRDD: a grouped RDD of (partitionID, dataList) pairs, where
      partitionID is an integer and dataList is a list of
      (SparseVector(x), y) values.

    Returns an RDD holding each feature found in any dataList exactly once.
    """
    # Drop the partition ids; only the per-partition data lists matter here.
    data_lists = groupedDataRDD.values()

    # Expand each data list into the features it contains.
    features_with_repeats = data_lists.flatMap(
        lambda data_list: getAllFeatures(data_list))

    # A feature may show up in several partitions; keep one copy of each.
    return features_with_repeats.distinct()
def basicStatistics(groupedDataRDD):
    """Summarise the size of each partition in groupedDataRDD.

    For every (partitionID, dataList) pair, two quantities are measured: the
    number of datapoints in dataList and the number of features returned by
    getAllFeatures(dataList).

    Returns a pair (datapoint_stats, feature_stats); each element is a
    (min, max, mean) tuple taken over all partitions.
    """
    def min_max_mean(counts_rdd):
        # Evaluated left to right: min, then max, then mean.
        return (counts_rdd.min(), counts_rdd.max(), counts_rdd.mean())

    data_lists = groupedDataRDD.values()

    datapoints_per_partition = data_lists.map(
        lambda data_list: len(data_list))
    features_per_partition = data_lists.map(
        lambda data_list: len(getAllFeatures(data_list)))

    datapoint_stats = min_max_mean(datapoints_per_partition)
    feature_stats = min_max_mean(features_per_partition)
    return datapoint_stats, feature_stats
def mapFeaturesToPartitionsRDD(groupedDataRDD, N):
    """Build an RDD linking each feature to the partitions that contain it.

    Given a groupedDataRDD of (partitionID, dataList) pairs, produce *all*
    pairs of the form (feat, partitionID), where feat is a feature label
    returned by getAllFeatures for the dataList associated with partitionID.

    Inputs:
    - groupedDataRDD: RDD containing the grouped data
    - N: number of partitions of the returned RDD

    The returned RDD is partitioned via partitionBy(N), using its default
    partitioning function, and is cached.
    """
    def features_tagged_with_partition(pair):
        # pair[0] is the partition id, pair[1] the data list of that partition.
        partition_id = pair[0]
        return [(feat, partition_id) for feat in getAllFeatures(pair[1])]

    feature_partition_pairs = groupedDataRDD.flatMap(
        features_tagged_with_partition)
    return feature_partition_pairs.partitionBy(N).cache()