def _compute_tfid(texts: RDD) -> RDD:
    """Attach a TF-IDF vector to every element of ``texts``.

    Each element must expose a ``words`` attribute (passed to
    ``HashingTF``) and a ``set_tfidf(vector)`` method. The returned RDD
    holds whatever ``set_tfidf`` returns for each element, not an
    ``IDFModel``.

    Args:
        texts: RDD of text objects as described above.

    Returns:
        RDD with one ``set_tfidf`` result per input element.
    """
    term_freqs = HashingTF().transform(texts.map(lambda text: text.words))
    # Cached because it is traversed twice below: once by IDF.fit and once
    # by the fitted model's transform.
    term_freqs.cache()
    tfidf_vectors = IDF().fit(term_freqs).transform(term_freqs)
    # NOTE(review): RDD.zip presumably needs both sides to share the same
    # partitioning and per-partition element counts; that should hold here
    # because tfidf_vectors is derived element-wise from texts -- confirm.
    paired = texts.zip(tfidf_vectors)
    return paired.map(lambda pair: pair[0].set_tfidf(pair[1]))
def covariance(rdd1: RDD, mean1: float, rdd2: RDD, mean2: float) -> float:
    """Return the population covariance of two element-aligned numeric RDDs.

    Computes ``sum((x - mean1) * (y - mean2)) / n`` over the pairs produced
    by ``rdd1.zip(rdd2)``, where ``n`` is the number of pairs.

    Args:
        rdd1: RDD of numbers.
        mean1: Precomputed mean of ``rdd1``.
        rdd2: RDD of numbers, zippable with ``rdd1``.
        mean2: Precomputed mean of ``rdd2``.

    Returns:
        The covariance, normalised by ``n`` (not ``n - 1``).

    Raises:
        ZeroDivisionError: If the zipped RDD is empty (count of 0).
    """
    rdd_zipped = rdd1.zip(rdd2)
    # RDD.map calls its function with ONE argument (the zipped tuple), so
    # the pair is taken as a single parameter and indexed; a two-parameter
    # lambda would raise TypeError on every element.
    products = rdd_zipped.map(
        lambda pair: (pair[0] - mean1) * (pair[1] - mean2)
    )
    return products.sum() / rdd_zipped.count()