Beispiel #1
0
def getAllFeaturesRDD(groupedDataRDD):
    """ Collect the distinct features occurring anywhere in groupedDataRDD.

        The input is:
            - groupedDataRDD: an RDD of pairs of the form (partitionID,dataList); by this file's
              convention partitionID is an integer and dataList is a list of
              (SparseVector(x),y) values

        Only the dataList half of every pair is used: each one is handed to getAllFeatures,
        the per-list results are flattened into one RDD, and duplicates are dropped.

        The return value is an RDD in which each such feature appears exactly once.
    """
    dataLists = groupedDataRDD.values()
    # flatMap merges the per-list feature collections into a single flat RDD
    features = dataLists.flatMap(getAllFeatures)
    return features.distinct()
Beispiel #2
0
def basicStatistics(groupedDataRDD):
    """ Return some basic statistics about the data in each partition in groupedDataRDD.

        The input is:
            - groupedDataRDD: an RDD of pairs of the form (partitionID,dataList); only the
              dataList values are inspected

        The return value is a pair (datapoint_stats, feature_stats), where
            - datapoint_stats = (min, max, mean) of len(dataList) over all pairs
            - feature_stats   = (min, max, mean) of len(getAllFeatures(dataList)) over all pairs

        Six Spark actions are needed to obtain these numbers, so the per-pair sizes are computed
        once, cached for the duration of the call and released again before returning. Without
        the cache every action could re-evaluate the whole lineage of groupedDataRDD and re-run
        getAllFeatures on every data list.
    """
    # One (number of datapoints, number of features) tuple per (partitionID,dataList) pair
    sizes = groupedDataRDD.values().map(
        lambda dataList: (len(dataList), len(getAllFeatures(dataList)))).cache()

    try:
        num_datapoints = sizes.keys()
        num_features = sizes.values()

        datapoint_stats = (num_datapoints.min(), num_datapoints.max(),
                           num_datapoints.mean())
        feature_stats = (num_features.min(), num_features.max(),
                         num_features.mean())
    finally:
        # Release the temporary cache even if one of the actions fails
        sizes.unpersist()

    return datapoint_stats, feature_stats
Beispiel #3
0
def mapFeaturesToPartitionsRDD(groupedDataRDD, N):
    """ Build an RDD linking every feature to the partitions whose data contain it.

        Given a groupedDataRDD containing pairs of the form

              (partitionID,dataList)

        the result contains one pair of the form

              (feat,partitionID)

        for every feat returned by getAllFeatures(dataList), i.e. for every feature label
        found in the data list associated with partitionID.

        The inputs are:
            - groupedDataRDD:  RDD containing the grouped data
            - N: Number of partitions of the returned RDD

        The returned RDD is keyed by feature, split into N partitions by partitionBy's
        default partitioning function, and cached.
    """
    def featurePartitionPairs(pair):
        # pair is (partitionID, dataList); tag each feature of dataList with its partitionID
        features = getAllFeatures(pair[1])
        return [(feat, pair[0]) for feat in features]

    featurePairs = groupedDataRDD.flatMap(featurePartitionPairs)
    return featurePairs.partitionBy(N).cache()