    # Feature transformers used below; the SparkSession is assumed to be available as `spark`
    from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram

    df = spark.createDataFrame([
        (0,
         "From all sides were heard the footsteps and talk of the infantry, who were walking, driving past, and settling down all around."
         ),
        (1,
         "IIt was no longer, as before, a dark, unseen river flowing through the gloom, but a dark sea swelling and gradually subsiding after a storm."
         ),
        (2,
         "\"You don't mind your honor?\" he asked Tushin. \"I've lost my company, your honor. I don't know where... such bad luck!'\""
         )
    ], "id int, message string")

    df.show(truncate=False)

    # A tokenizer that converts the input string to lowercase and then splits it by white spaces.
    words = Tokenizer(inputCol="message", outputCol="words").transform(df)
    words.show(truncate=False)

    # Regex-based data cleaning and tokenization
    # A regex-based tokenizer that extracts tokens either by using the provided regex
    # pattern (in Java dialect) to split the text (the default) or by repeatedly matching
    # the regex (if gaps is false). Optional parameters also allow filtering tokens using
    # a minimal length. It returns an array of strings that can be empty.
    words = RegexTokenizer(inputCol="message",
                           outputCol="words",
                           pattern="\\W+").transform(df)
    words.show(truncate=False)
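    # A minimal sketch of the optional parameters mentioned above: matching the regex
    # repeatedly (gaps=False) instead of splitting on it, and filtering out short tokens.
    # The pattern and minTokenLength values here are illustrative only.
    words_matched = RegexTokenizer(inputCol="message",
                                   outputCol="words",
                                   pattern="[a-z']+",
                                   gaps=False,
                                   minTokenLength=2).transform(df)
    words_matched.show(truncate=False)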

    # StopWordsRemover is a feature transformer that filters out stop words from the input.
    stop_words_removed = StopWordsRemover(
        inputCol="words", outputCol="stop_words_removed").transform(words)
    stop_words_removed.show(truncate=False)

    # Group the tokens into sets of n words (n-grams)
    # NGram is a feature transformer that converts the input array of strings into an
    # array of n-grams. Null values in the input array are ignored. It returns an array
    # of n-grams where each n-gram is represented by a space-separated string of words.
    # When the input is empty, an empty array is returned. When the input array length
    # is less than n (number of elements per n-gram), no n-grams are returned.
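    # A minimal sketch of applying NGram to the stop-word-filtered tokens produced above
    # (n=2 and the output column name are illustrative only):
    ngrams = NGram(n=2,
                   inputCol="stop_words_removed",
                   outputCol="ngrams").transform(stop_words_removed)
    ngrams.show(truncate=False)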
Example No. 2
# Exercise_10 
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

--------------------------------------------------
# Exercise_11 
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words (the tokenized SMS messages are assumed to be provided as `sms`,
# with the tokens in a 'words' column)
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)
Example No. 3
# Import the necessary functions and transformers
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

books.printSchema()

# Regular expression (REGEX) to match commas and hyphens
REGEX1 = '[,]'
REGEX2 = '[\\-]'
books = books.withColumn('text', regexp_replace(books.title, REGEX1, ''))
books = books.withColumn('text', regexp_replace(books.text, REGEX2, ' '))

# Text to tokens
books = Tokenizer(inputCol="text", outputCol="tokens").transform(books)

# Take a look at the list of stop words
stopwords = StopWordsRemover()
print(stopwords.getStopWords())

# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
books = hasher.transform(books)

# Dealing with common words
books = IDF(inputCol="hash", outputCol="features").fit(books).transform(books)

# View the first five records
books.show(5, truncate=False)
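# The same steps can also be chained into a single Pipeline; a minimal sketch, assuming
# a dataframe `books_raw` (hypothetical name) that still holds only the cleaned 'text' column:
from pyspark.ml import Pipeline

text_pipeline = Pipeline(stages=[
    Tokenizer(inputCol="text", outputCol="tokens"),
    StopWordsRemover(inputCol="tokens", outputCol="words"),
    HashingTF(inputCol="words", outputCol="hash", numFeatures=32),
    IDF(inputCol="hash", outputCol="features"),
])
# features = text_pipeline.fit(books_raw).transform(books_raw)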

spark.stop()