def evaluate(truth: RDD, prediction: RDD) -> float:
    """
    Compute the root-mean-square error between true and predicted ratings.

    Both inputs are re-keyed by their first two elements, joined on that
    composite key, and the squared differences of the paired third
    elements are averaged. Only keys present in both RDDs take part in
    the result, because the pairing is done with ``RDD.join``.

    :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :return: float = RMSE
    """
    def _key_by_pair(record):
        # (bucket, item, rating) -> ((bucket, item), rating)
        return (record[0], record[1]), record[2]

    def _squared_error(joined):
        # joined is (key, (true_rating, predicted_rating))
        _, (actual, predicted) = joined
        return (actual - predicted) ** 2

    keyed_truth = truth.map(_key_by_pair)
    keyed_prediction = prediction.map(_key_by_pair)
    mean_squared_error = (
        keyed_truth.join(keyed_prediction).map(_squared_error).mean()
    )
    return mean_squared_error ** 0.5
Ejemplo n.º 2
0
def _join(rdd: RDD, other: RDD, func=None):
    """
    Join two keyed RDDs, optionally combining the paired values.

    The join uses the larger of the two inputs' partition counts as its
    ``numPartitions``.

    :param rdd: left-hand RDD of the join
    :param other: right-hand RDD of the join
    :param func: optional two-argument callable; when given, it is applied
        through ``_map_value`` to elements 0 and 1 of each item that
        ``_map_value`` hands to the callback (presumably the joined
        (left, right) value pair -- confirm against ``_map_value``)
    :return: the joined RDD, post-processed with ``func`` if one was supplied
    """
    joined = rdd.join(
        other,
        numPartitions=max(rdd.getNumPartitions(), other.getNumPartitions()),
    )
    if func is None:
        return joined
    return _map_value(joined, lambda pair: func(pair[0], pair[1]))