from pyspark.ml.feature import StopWordsRemover, NGram, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
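
# A minimal assumed setup so the snippets below run end to end: the original
# does not show how the `words` DataFrame was built, so we create a
# SparkSession and tokenize a couple of toy sentences here (the sentences and
# the "sentence" column name are assumptions, not from the original).
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

spark = SparkSession.builder.appName("feature-transformers").getOrCreate()
sentences = spark.createDataFrame(
    [("spark is a fast engine for large scale data processing",),
     ("logistic regression models are neat",)],
    ["sentence"])
words = Tokenizer(inputCol="sentence", outputCol="words").transform(sentences)
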
# StopWordsRemover is a feature transformer that filters out stop words
# from the input column.
stop_words_removed = StopWordsRemover(
    inputCol="words", outputCol="stop_words_removed").transform(words)
stop_words_removed.show(truncate=False)

# Group the tokens into n-word chunks.
# NGram is a feature transformer that converts the input array of strings
# into an array of n-grams. Null values in the input array are ignored.
# It returns an array of n-grams where each n-gram is represented by a
# space-separated string of words. When the input is empty, an empty array
# is returned. When the input array length is less than n (number of
# elements per n-gram), no n-grams are returned.
ngram_df = NGram(n=2, inputCol="words",
                 outputCol="ngrams").transform(words)

ngram_df.show(truncate=False)
ngram_df.select("ngrams").show(truncate=False)

# TF-IDF is a numerical statistic that is intended to reflect how important
# a word is to a document in a collection or corpus.[1] It is often used as
# a weighting factor in searches of information retrieval, text mining, and
# user modeling.
df = words.select("words")
df.show(truncate=False)
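
# For reference, the weighting Spark's IDF estimator applies (per the MLlib
# docs) is the smoothed inverse document frequency
#     IDF(t, D) = log((|D| + 1) / (DF(t, D) + 1)),
# and the final score is TFIDF(t, d, D) = TF(t, d) * IDF(t, D), where |D| is
# the number of documents and DF(t, D) counts documents containing term t.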

# HashingTF computes term frequencies by hashing each term to an index,
# which fixes the feature vector at a predetermined size (numFeatures).
df_tf = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15  # the default is 262144 (2^18) dimensions
).transform(df)

df_tf.show()
df_tf.select("words").show(truncate=False)
df_tf.select("hashing_tf").show(truncate=False)
# Each row is a SparseVector: the first list holds the hashed term indices,
# the second list the corresponding term counts.
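
# A peek at the underlying structure: pull the first row's vector and print
# its parts (this assumes df_tf has at least one row).
vec = df_tf.select("hashing_tf").first()[0]
print(vec.indices, vec.values)  # hashed term indices and their counts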

# IDF down-weights terms that appear in many documents. The example below is
# a separate pipeline: it applies StopWordsRemover -> HashingTF -> IDF to a
# tweets DataFrame `df` and then trains a classifier on it.
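# The tweets DataFrame is not shown in the original, so here is a minimal
# assumed stand-in with the three columns the code below relies on:
# 'tokens', 'favorite_count', and 'text' (the rows are made up for the sketch).
tweets = [("spark makes big data simple", 500),
          ("loving the new spark pipelines", 320),
          ("mllib model training at scale", 410),
          ("great spark streaming demo today", 150),
          ("distributed dataframes are fast", 270),
          ("i had a sandwich for lunch", 4),
          ("stuck in traffic again", 11),
          ("my cat ignored me today", 8),
          ("forgot my umbrella this morning", 2),
          ("need more coffee right now", 19)]
df = spark.createDataFrame(
    [(t.split(), fav, t) for t, fav in tweets],
    ["tokens", "favorite_count", "text"])
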
stopwords = StopWordsRemover(inputCol='tokens', outputCol='words')
df = stopwords.transform(df)

# hash the tokens
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
df = hasher.transform(df)

# create an IDF based on the hashed tokens
df = IDF(inputCol="hash", outputCol="features").fit(df).transform(df)
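# Note: unlike the transformers above, IDF is an Estimator; .fit(df) scans the
# corpus to compute document frequencies and returns an IDFModel, whose
# .transform(df) then rescales the hashed term counts.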

# recode favorite_count into a binary label: 100 or more favorites -> 1,
# fewer than 100 -> 0
df = df.withColumn('label', (df['favorite_count'] >= 100).cast('integer'))

# select only the needed columns
df = df.select('text', 'label', 'features')

# create the test/train split
df_train, df_test = df.randomSplit([0.8, 0.2], seed=23)

# run logistic regression on the training set
lr = LogisticRegression()
lr_model = lr.fit(df_train)

# create predictions based on the test set
predictions = lr_model.transform(df_test)

# cross-tabulate actual labels against predictions (a confusion matrix)
predictions.groupBy('label', 'prediction').count().show()

# print the test results table (shown above) and the area under the ROC curve
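# A minimal sketch of that evaluation using PySpark's
# BinaryClassificationEvaluator; areaUnderROC is its default metric, and its
# default rawPredictionCol/labelCol match the LogisticRegression output above.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
print('AUC-ROC: {:.3f}'.format(evaluator.evaluate(predictions)))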