Example #1
0
    # Keep only reviews whose product (asin) also appears in the prices
    # table; a left-semi join filters rows without adding price columns
    # or duplicating reviews.
    reviews = reviews.join(prices, ["asin"], how="leftsemi")

    # Tokenize review text on non-word characters. RegexTokenizer runs on
    # the JVM side, avoiding the per-row Python overhead of a UDF-based
    # tokenizer (an earlier nltk word_tokenize UDF was abandoned here).
    reviews = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(reviews)
    reviews = reviews.drop("reviewText")

    # Replace the token array with just its length to keep rows small.
    reviews = reviews.withColumn("num_words", size("words"))
    reviews = reviews.drop("words")

    # Average review length per product. The aggregated frame contains
    # only "asin" and "average_review_length", so the original
    # drop("num_words") that followed was a no-op and has been removed.
    reviews = reviews.groupBy("asin").agg(avg("num_words").alias("average_review_length"))

    # Attach the price to each product's average review length; asin is
    # no longer needed once the join key has served its purpose.
    data = reviews.join(prices, ["asin"])
    data = data.drop("asin")
    data = data.repartition(20)

    # Build (x, y) = (average_review_length, price) pairs for the
    # regression computed below.
    xy = data.rdd.map(lambda row: (row.average_review_length, row.price))
    xy = xy.coalesce(8)
    x = xy.map(lambda v: v[0])
    y = xy.map(lambda v: v[1])
    n = x.count()

    # Partial sums for the regression statistics.
    sum_x = x.reduce(lambda a, b: a + b)
    sum_y = y.reduce(lambda a, b: a + b)