Example 1
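This example computes per-review TF-IDF weights with Spark ML and writes the result out as JSON. The snippet assumes an active SparkSession named spark and a DataFrame df with a string column reviewText; a minimal setup sketch (the input path and app name here are hypothetical) might look like:

    from pyspark.sql import SparkSession

    # Hypothetical setup: the snippet itself only assumes `spark` and `df` exist.
    spark = SparkSession.builder.appName("tfidf-example").getOrCreate()
    df = spark.read.json("hdfs:/input/reviews")  # must provide a "reviewText" column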
    from pyspark.ml.feature import CountVectorizer, IDF, RegexTokenizer
    from pyspark.sql.functions import udf
    from pyspark.sql.types import FloatType, MapType, StringType

    # Alternative tokenizer via an NLTK UDF, kept commented out in favor of
    # RegexTokenizer below (it would also need ArrayType and nltk's word_tokenize):
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)
    #
    # df = df.withColumn("words", tokenize("reviewText"))

    # Split reviewText on non-word characters (RegexTokenizer also lowercases
    # by default) into a "words" array column.
    df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df)
    df = df.drop("reviewText")

    # Fit a CountVectorizer to produce raw term-frequency vectors; keep its
    # vocabulary so vector indices can be mapped back to words later.
    cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
    vocabulary = cv_model.vocabulary

    df = cv_model.transform(df)
    df = df.drop("words")
    # Cache the term-frequency DataFrame and keep a handle to it, so the
    # cached DataFrame itself can be unpersisted once it is no longer needed.
    tf_df = df.cache()

    # Rescale raw term frequencies to TF-IDF weights.
    df = IDF(inputCol="tf", outputCol="tfidf").fit(tf_df).transform(tf_df)
    df = df.drop("tf")

    # UDF that expands the sparse tfidf vector into a {word: weight} map,
    # using the CountVectorizer vocabulary to translate indices back to words.
    # (Note: the name shadows pyspark.sql.functions.create_map.)
    @udf(MapType(StringType(), FloatType()))
    def create_map(vector):
        zipped = zip(vector.indices, vector.values)
        return dict((vocabulary[int(x)], float(y)) for (x, y) in zipped)

    results = df.withColumn("tfidf", create_map("tfidf"))

    # Write one JSON record per review; the map column serializes as a JSON
    # object. Release the cached term-frequency DataFrame after the write,
    # since the write re-reads it through the cache.
    results.write.json("hdfs:/output/tfidf", mode="overwrite")
    tf_df.unpersist()

    spark.stop()
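Each output record keeps whatever other columns the input had, plus a tfidf field that serializes as a JSON object mapping each token in the review to its TF-IDF weight. A quick sanity check of the output (hypothetical, in a fresh session since the snippet stops its own) might be:

    from pyspark.sql import SparkSession

    # Hypothetical check: inspect a few of the written tfidf maps.
    spark = SparkSession.builder.getOrCreate()
    spark.read.json("hdfs:/output/tfidf").select("tfidf").show(5, truncate=False)
    spark.stop()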