# StopWordsRemover is feature transformer that filters out stop words from input. stop_words_removed = StopWordsRemover( inputCol="words", outputCol="stop_words_removed").transform(words) stop_words_removed.show(truncate=False) # 變成n字一組 # NGram is a feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words. When the input is empty, an empty array is returned. When the input array length is less than n (number of elements per n-gram), no n-grams are returned. ngram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(words) ngram_df.show(truncate=False) ngram_df.select("ngrams").show(truncate=False) # TF-IDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.[1] It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. df = words.select("words") df.show(truncate=False) # Hashing TF is TF with hashing enabled to allow the feature vector to be a set value df_tf = HashingTF( inputCol="words", outputCol="hashing_tf", numFeatures=15 #預設是262144維 ).transform(df) df_tf.show() df_tf.select("words").show(truncate=False) df_tf.select("hashing_tf").show(truncate=False) #第一個list代表詞的index,第2個list代表詞出現次數 # IDF
# Remove stop words from the tokenized text; cleaned tokens go to the 'words' column.
stopwords = StopWordsRemover()
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
df = stopwords.transform(df)

# hash the tokens into a fixed-size (32-bucket) term-frequency vector
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
df = hasher.transform(df)

# create an IDF based on the hashed tokens (fit the IDF model, then reweight the TF vectors)
df = IDF(inputCol="hash", outputCol="features").fit(df).transform(df)

# recode favorite count: 100 or more as 1, less than 100 as 0, call it label
df = df.withColumn('label', (df['favorite_count'] >= 100).cast('integer'))

# select only the needed columns
df = df.select('text', 'label', 'features')

# create the test/train split (80/20; fixed seed so the split is reproducible)
df_train, df_test = df.randomSplit([0.8, 0.2], seed=23)

# run logistic regression on the training set
lr = LogisticRegression()
lr_model = lr.fit(df_train)

# create predictions based on the test set
predictions = lr_model.transform(df_test)

# group the predictions into a label/prediction count table (confusion-matrix style)
predictions.groupBy('label', 'prediction').count().show()

# print the test results table, and the AUC-ROC curve area