# Keep only reviews whose product ("asin") also has a price entry.
reviews = reviews.join(prices, ["asin"], how="leftsemi")

# Split each review into word tokens on non-word characters.
# (A previous nltk word_tokenize UDF was removed in favor of the built-in,
# JVM-side RegexTokenizer, which avoids Python UDF serialization overhead.)
reviews = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(reviews)
reviews = reviews.drop("reviewText")

# Count tokens per review, then drop the token array itself.
reviews = reviews.withColumn("num_words", size("words"))
reviews = reviews.drop("words")

# Per-product average review length. After this aggregation the frame
# contains only "asin" and "average_review_length", so no further column
# cleanup is needed. (The old drop("num_words") here was a silent no-op:
# DataFrame.drop ignores missing columns.)
reviews = reviews.groupBy("asin").agg(avg("num_words").alias("average_review_length"))

# Attach the price to each product's average review length.
data = reviews.join(prices, ["asin"])
data = data.drop("asin")
data = data.repartition(20)

# (average_review_length, price) pairs as an RDD for the regression sums.
xy = data.rdd.map(lambda row: (row.average_review_length, row.price))
xy = xy.coalesce(8)
# Cache: xy feeds three separate actions below (count + two reduces);
# without caching the whole DataFrame lineage is recomputed for each.
xy = xy.cache()

x = xy.map(lambda v: v[0])
y = xy.map(lambda v: v[1])

# Basic sums for a least-squares fit of price vs. average review length.
n = x.count()
sum_x = x.reduce(lambda a, b: a + b)
sum_y = y.reduce(lambda a, b: a + b)