(0, "From all sides were heard the footsteps and talk of the infantry, who were walking, driving past, and settling down all around." ), (1, "IIt was no longer, as before, a dark, unseen river flowing through the gloom, but a dark sea swelling and gradually subsiding after a storm." ), (2, "\"You don't mind your honor?\" he asked Tushin. \"I've lost my company, your honor. I don't know where... such bad luck!'\"" ) ], "id int, message string") df.show(truncate=False) #A tokenizer that converts the input string to lowercase and then splits it by white spaces. words = Tokenizer(inputCol="message", outputCol="words").transform(df) words.show(truncate=False) # 正規表達法資料清洗及斷詞 # A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text (default) or repeatedly matching the regex (if gaps is false). Optional parameters also allow filtering tokens using a minimal length. It returns an array of strings that can be empty. words = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W+").transform(df) words.show(truncate=False) # StopWordsRemover is feature transformer that filters out stop words from input. stop_words_removed = StopWordsRemover( inputCol="words", outputCol="stop_words_removed").transform(words) stop_words_removed.show(truncate=False) # 變成n字一組 # NGram is a feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words. When the input is empty, an empty array is returned. When the input array length is less than n (number of elements per n-gram), no n-grams are returned.
# Exercise_10 # Import the necessary functions from pyspark.sql.functions import regexp_replace from pyspark.ml.feature import Tokenizer # Remove punctuation (REGEX provided) and numbers wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' ')) wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' ')) # Merge multiple spaces wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' ')) # Split the text into words wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled) wrangled.show(4, truncate=False) -------------------------------------------------- # Exercise_11 from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF # Remove stop words. wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\ .transform(sms) # Apply the hashing trick wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\ .transform(wrangled) # Convert hashed symbols to TF-IDF tf_idf = IDF(inputCol='hash', outputCol='features')\
# Inspect the schema of the books DataFrame before transforming it.
books.printSchema()

# Regular expressions for title clean-up:
#   COMMA_RE  — commas are dropped entirely
#   HYPHEN_RE — hyphens become spaces so hyphenated words split into tokens
COMMA_RE = '[,]'
HYPHEN_RE = '[\\-]'

books = books.withColumn('text', regexp_replace(books.title, COMMA_RE, ''))
books = books.withColumn('text', regexp_replace(books.text, HYPHEN_RE, ' '))

# Tokenize: lowercase the cleaned text and split it on whitespace.
books = Tokenizer(inputCol="text", outputCol="tokens").transform(books)

# Peek at the default stop-word list, then wire the remover to our columns.
remover = StopWordsRemover()
remover.getStopWords()
remover = remover.setInputCol('tokens').setOutputCol('words')
books = remover.transform(books)

# Hash each token list into a fixed-size (32-bucket) term-frequency vector.
tf_hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
books = tf_hasher.transform(books)

# Down-weight terms that occur across many titles (TF-IDF).
idf_model = IDF(inputCol="hash", outputCol="features").fit(books)
books = idf_model.transform(books)

# Show the resulting records without truncating the wide feature columns.
books.show(truncate=False)

spark.stop()