df = tokenizer.transform(df).select("words", "overall")
df.show()

## COUNT
from pyspark.ml.feature import CountVectorizer

print("countvectorize")
df = CountVectorizer(inputCol="words", outputCol="countVector", vocabSize=2000, minDF=8.0)\
    .fit(df)\
    .transform(df)\
    .select("countVector", "overall")
df.show(truncate=False)

### TF-IDF
from pyspark.ml.feature import IDF

print("tfidf")
df = IDF(inputCol="countVector", outputCol="tfidf").fit(df).transform(df).select("tfidf", "overall")
df.show()

### PCA
from pyspark.ml.feature import PCA

print("pca")
df = PCA(k=300, inputCol="tfidf", outputCol="pca").fit(df).transform(df).select("pca", "overall")
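
# The four steps above can equivalently be chained as a single pyspark.ml.Pipeline.
# A minimal sketch, assuming a raw DataFrame raw_df with a text column named
# "reviewText" (that column name is an assumption for illustration; "overall"
# and the other column names come from the code above).
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, PCA

pipeline = Pipeline(stages=[
    Tokenizer(inputCol="reviewText", outputCol="words"),  # assumed raw-text column
    CountVectorizer(inputCol="words", outputCol="countVector",
                    vocabSize=2000, minDF=8.0),
    IDF(inputCol="countVector", outputCol="tfidf"),
    PCA(k=300, inputCol="tfidf", outputCol="pca"),
])
# fit() trains every stage in order; transform() then applies them all at once
result = pipeline.fit(raw_df).transform(raw_df).select("pca", "overall")
result.show()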
# HashingTF computes term frequencies with the hashing trick, so the feature
# vector has a fixed, preset size regardless of vocabulary
from pyspark.ml.feature import HashingTF, IDF, CountVectorizer

df_tf = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15  # default is 262144 dimensions
).transform(df)
df_tf.show()
df_tf.select("words").show(truncate=False)
df_tf.select("hashing_tf").show(truncate=False)  # first list: word indices; second list: occurrence counts

# IDF
df_tf_idf = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(df_tf).transform(df_tf)
df_tf_idf.show()
df_tf_idf.select("words").show(truncate=False)
df_tf_idf.select("hashing_tf").show(truncate=False)  # Hashing TF
df_tf_idf.select("tf_idf").show(truncate=False)  # IDF

# TF from CountVectorizer, which extracts the vocabulary and per-document counts from a document collection
df = words.select("words")
df.show(truncate=False)
df_tf_cv = CountVectorizer(inputCol="words", outputCol="tf_cv").fit(df).transform(df)
df_tf_cv.show()
df_tf_cv.select("words").show(truncate=False)
df_tf_cv.select("tf_cv").show(truncate=False)
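
# A self-contained toy run (assumed, made-up data) showing how to read the
# sparse vectors and the key difference between HashingTF and CountVectorizer.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, CountVectorizer

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(["spark", "spark", "hash"],)],  # one document; "spark" appears twice
    ["words"],
)

# HashingTF: indices come from hashing each word modulo numFeatures, so they
# cannot be mapped back to words -- the output reads like
# (15, [3, 9], [1.0, 2.0]), where the actual indices depend on the hash and
# the values are raw counts (2.0 must belong to "spark").
HashingTF(inputCol="words", outputCol="hashing_tf", numFeatures=15)\
    .transform(toy)\
    .show(truncate=False)

# CountVectorizer: indices point into a learned vocabulary, so the fitted
# model can map them back to words.
cv_model = CountVectorizer(inputCol="words", outputCol="tf_cv").fit(toy)
print(cv_model.vocabulary)  # e.g. ['spark', 'hash'] -- index 0 is "spark"
cv_model.transform(toy).show(truncate=False)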