Example #1
0
        # NOTE(review): this excerpt starts mid-statement — the call that
        # consumed this `schema` (with its opening parenthesis) is above the
        # visible fragment, which is why the next line ends with an extra ")".
        schema = StructType([StructField("asin", StringType(), True),
                             StructField("price", FloatType(), True)]))

    # Keep only reviews whose "asin" also appears in `prices`; a left-semi
    # join filters the left side without adding any columns from the right.
    reviews = reviews.join(prices, ["asin"], how="leftsemi")

    # # Use nltk.word_tokenizer to tokenize words
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # reviews = reviews.withColumn("words", tokenize("reviewText"))

    # Tokenize "reviewText" into a "words" array, splitting on runs of
    # non-word characters (regex \W), then drop the raw text column.
    reviews = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(reviews)
    reviews = reviews.drop("reviewText")

    # Replace the token array with its length (review length in words).
    reviews = reviews.withColumn("num_words", size("words"))
    reviews = reviews.drop("words")

    # Average review length per product (asin).
    reviews = reviews.groupBy("asin").agg(avg("num_words").alias("average_review_length"))
    # NOTE(review): no-op — "num_words" no longer exists after the
    # groupBy/agg above; DataFrame.drop silently ignores missing columns.
    reviews = reviews.drop("num_words")

    # Attach the price to each product's average review length; "asin" is
    # only the join key, so it is dropped afterwards.
    data = reviews.join(prices, ["asin"])
    data = data.drop("asin")
    data = data.repartition(20)

    # Extract (average_review_length, price) pairs as an RDD for the
    # downstream (presumably regression) computation, then split into the
    # x and y series and count the observations.
    xy = data.rdd.map(lambda row: (row.average_review_length, row.price))
    xy = xy.coalesce(8)
    x = xy.map(lambda v: v[0])
    y = xy.map(lambda v: v[1])
    n = x.count()
Example #2
0
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # df = df.withColumn("words", tokenize("reviewText"))

    # Tokenize "reviewText" into a "words" array, splitting on runs of
    # non-word characters (regex \W), then drop the raw text column.
    df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df)
    df = df.drop("reviewText")

    # Fit a CountVectorizer to learn the vocabulary and produce raw term
    # frequencies in column "tf"; keep the learned vocabulary so term
    # indices can later be mapped back to the actual words.
    cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
    vocabulary = cv_model.vocabulary

    df = cv_model.transform(df)
    df = df.drop("words")
    # Cache before IDF: both fit() and transform() below will re-scan df,
    # so caching avoids recomputing the tokenize/vectorize lineage twice.
    df.cache()

    # Rescale term frequencies by inverse document frequency -> "tfidf".
    df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df)
    df = df.drop("tf")
    df.unpersist()
    @udf(MapType(StringType(), FloatType()))
    def create_map(vector):
        """Turn a sparse TF-IDF vector into a {term: weight} dict.

        Each active index in the vector is looked up in the enclosing
        `vocabulary` list to recover the term string; the corresponding
        value becomes that term's weight.
        """
        return {vocabulary[int(idx)]: float(weight)
                for idx, weight in zip(vector.indices, vector.values)}

    results = df.withColumn("tfidf", create_map("tfidf"))

    results.write.json("hdfs:/output/tfidf", mode="overwrite")

    spark.stop()