# Example no. 1
# 0
# Tokenize the documents, then keep only the token array and the label column.
tokenized = tokenizer.transform(df)
df = tokenized.select("words", "overall")
df.show()

## COUNT
from pyspark.ml.feature import CountVectorizer

print("countvectorize")
# Build a bag-of-words count vector per document: keep at most 2000 terms,
# each of which must appear in at least 8 documents.
vectorizer = CountVectorizer(inputCol="words",
                             outputCol="countVector",
                             vocabSize=2000,
                             minDF=8.0)
cv_model = vectorizer.fit(df)
df = cv_model.transform(df).select("countVector", "overall")

df.show(truncate=False)

###

from pyspark.ml.feature import IDF
print("tfidf")
# Re-weight the raw term counts by inverse document frequency.
idf_model = IDF(inputCol="countVector", outputCol="tfidf").fit(df)
df = idf_model.transform(df).select("tfidf", "overall")
df.show()

###

from pyspark.ml.feature import PCA
print("pca")
# Project the tf-idf vectors down to 300 principal components.
pca_model = PCA(k=300, inputCol="tfidf", outputCol="pca").fit(df)
df = pca_model.transform(df).select("pca", "overall")
    # Hashing TF is TF with hashing enabled to allow the feature vector to be a set value
    # NOTE(review): this indented fragment belongs to an enclosing function whose
    # header is not visible here; `HashingTF` and `words` must be defined/imported
    # elsewhere in the file — confirm.
    df_tf = HashingTF(
        inputCol="words",
        outputCol="hashing_tf",
        numFeatures=15  # default would be 262144 dimensions
    ).transform(df)

    df_tf.show()
    df_tf.select("words").show(truncate=False)
    df_tf.select("hashing_tf").show(truncate=False)
    # in the printed sparse vector, the first list holds the term indices and
    # the second list holds the corresponding term counts

    # IDF: re-weight the hashed term frequencies by inverse document frequency
    df_tf_idf = IDF(inputCol="hashing_tf",
                    outputCol="tf_idf").fit(df_tf).transform(df_tf)

    df_tf_idf.show()
    df_tf_idf.select("words").show(truncate=False)
    df_tf_idf.select("hashing_tf").show(truncate=False)  # Hashing TF
    df_tf_idf.select("tf_idf").show(truncate=False)  # IDF

    # TF from CountVectorizer, which is used to extract words and counts from document collection
    # NOTE(review): `words` here looks like a DataFrame from an earlier
    # tokenization step, not the "words" column — verify against the caller.
    df = words.select("words")
    df.show(truncate=False)

    df_tf_cv = CountVectorizer(inputCol="words",
                               outputCol="tf_cv").fit(df).transform(df)
    df_tf_cv.show()
    df_tf_cv.select("words").show(truncate=False)
    df_tf_cv.select("tf_cv").show(truncate=False)