Example #1
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input",
                                        outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(
         isinstance(stopWordRemover.getStopWords()[0], str))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame(
         [Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
Example #2
    def get_pd_keyword(self):

        df_spark = self.df_spark

        # Step 1. Text cleaning: strip punctuation and digits

        REGEX = r'[_,?\-.!?@#$%^&*+/\d]'
        df_spark = df_spark.withColumn(
            "description_clean",
            regexp_replace(df_spark.description, REGEX, ' '))

        # Step 2. Tokenization
        # df_spark = df_spark.drop("description_token")

        tokenizer = Tokenizer(inputCol='description_clean',
                              outputCol='description_token')
        df_spark = tokenizer.transform(df_spark)

        # Lemmatization (WordNet)
        # nltk.download('wordnet')
        lemmatizer = WordNetLemmatizer()

        def lemm_function(tokens):
            return [lemmatizer.lemmatize(token) for token in tokens]

        udf_lemm_function = F.udf(lemm_function, ArrayType(StringType()))

        df_spark = df_spark.withColumn(
            "description_lemm", udf_lemm_function(df_spark.description_token))

        # Step 3. Remove stop words

        stopwords_list = StopWordsRemover.loadDefaultStopWords("english")
        stopwords_customize_list = ["app", "apps"]
        stopwords_list = np.append(stopwords_list, stopwords_customize_list)

        stopwords = StopWordsRemover(inputCol="description_lemm",
                                     outputCol="description_no_stop",
                                     stopWords=stopwords_list)
        stopwords.getStopWords()
        df_spark = stopwords.transform(df_spark)

        df_pd_desc_final = df_spark.toPandas()

        # ### Note: the IDF vector must be trained on a large corpus, otherwise it loses the advantage of IDF

        # get the "description" column
        joinF = lambda x: " ".join(x)
        df_pd_desc_final["description_final"] = df_pd_desc_final[
            "description_no_stop"].apply(joinF)

        corpus_list = df_pd_desc_final["description_final"].tolist()

        df_pd_desc_final = get_tfidf(corpus_list, df_pd_desc_final, self.topn)

        return df_pd_desc_final
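
The helper get_tfidf used above is not shown in this example. A minimal sketch of what such a TF-IDF keyword step might look like, assuming scikit-learn is available; the function name, the "keywords" column, and the ranking logic are illustrative, not the original implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def get_tfidf_sketch(corpus_list, df_pd, topn):
    # Fit TF-IDF on the whole corpus; with a small corpus the IDF weights add
    # little, which is what the note above warns about.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus_list)        # shape: (n_docs, n_terms)
    terms = np.array(vectorizer.get_feature_names_out())
    # Keep each document's topn highest-weighted terms as its keywords.
    top_idx = np.argsort(tfidf.toarray(), axis=1)[:, ::-1][:, :topn]
    df_pd["keywords"] = [terms[row].tolist() for row in top_idx]
    return df_pd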
Example #4
def parse_data(path):
    spark = SparkSession.builder.appName("BigDataProject").getOrCreate()
    if (path == "../train.csv"):
        lcs = udf(lower_clean_str)
        rt = udf(rate_transform)
        # rews=udf(remove_extra_ws) #2/3
        df = spark.read.csv(path, header=False, sep="\t")
        df = df.withColumn("_c1", lcs("_c1"))
        # df=df.withColumn("_c1",rews("_c1")) #3/3
        expres = [split(col("_c1"), " ").alias("_c1")]
        df = df.withColumn("_c1", *expres)
        remover = StopWordsRemover(inputCol="_c1", outputCol="filtered")
        swlist = remover.getStopWords()
        swlist = swlist + list(set(stopwords.words('english'))) + ['']
        remover.setStopWords(swlist)
        #remover.transform(df).select("filtered")

        final = remover.transform(df.select("_c1"))
        df = df.withColumn('row_index', func.monotonically_increasing_id())
        final = final.withColumn('row_index',
                                 func.monotonically_increasing_id())
        final = final.join(df["row_index", "_c0"],
                           on=["row_index"]).drop("row_index").drop("_c1")
        final = final.withColumn("_c0", rt("_c0"))
        # fdf.show()
        return final
    elif (path == "../test.csv"):
        lcs = udf(lower_clean_str)
        # rews=udf(remove_extra_ws) #2/3
        df = spark.read.csv(path, header=False, sep="\t")
        initial = df
        df = df.withColumn("_c0", lcs("_c0"))
        # df=df.withColumn("_c1",rews("_c1")) #3/3
        expres = [split(col("_c0"), " ").alias("_c0")]
        df = df.select(*expres)
        remover = StopWordsRemover(inputCol="_c0", outputCol="filtered")
        swlist = remover.getStopWords()
        swlist = swlist + list(set(stopwords.words('english'))) + ['']
        remover.setStopWords(swlist)
        remover.transform(df).select("filtered")
        final = remover.transform(df.select("_c0"))
        return final, initial

    else:
        print "Wrong File or Path"
        return -1
Example #5
def set_pipeline(custom_stop_words=None):
    if not custom_stop_words:
        custom_stop_words = []

    re_tokenizer = RegexTokenizer(inputCol="text", outputCol="raw_tokens", pattern="\\W")
    stop_words_remover = StopWordsRemover(inputCol="raw_tokens", outputCol="words")
    stop_words_remover.setStopWords(stop_words_remover.getStopWords() + custom_stop_words)
    cv = CountVectorizer(inputCol="words", outputCol="vectors")
    pipeline = Pipeline(stages=[re_tokenizer, stop_words_remover, cv])
    return pipeline
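
A hypothetical usage of set_pipeline, assuming a Spark DataFrame df with a "text" column; the extra stop words shown are illustrative:

pipeline = set_pipeline(custom_stop_words=["rt", "amp"])
model = pipeline.fit(df)                      # fits the CountVectorizer stage
vectorized_df = model.transform(df).select("words", "vectors")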
Example #6
def task_four(ngram):
    """
    Prompt the user for the n-gram value; when bigrams (n == 2) are requested,
    remove stop words from the tokenized data.
    :param ngram: the n-gram size (e.g. 2 for bigrams)
    :return: None
    """
    params = list(inspect.getargspec(task_four))
    p = list(chain.from_iterable([i for i in params if i is not None]))
    param_values = {}
    if len(p) > 0:
        for i, v in enumerate(p):
            try:
                value = raw_input("Please enter a value for {} ==> ".format(v))
                param_values.update({v: value})
            except:
                pass
    ngram = param_values.get(p[0])

    if int(ngram) == 2:
        # --- list of stopwords
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'but',
            'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
            'for', 'with', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
            'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don',
            'should', 'now', ' a ', 'insured', 'sured', 'coverage', 'year',
            'dob', 'insd', 'left'
        }

        # --- remove stop words (default Spark list extended with the custom set above)
        REMOVER = StopWordsRemover()
        REMOVER.setStopWords(REMOVER.getStopWords() + list(stopwords))
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")

        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
                                    .select(["Claim_Id", "filename", "inter_wordlist_two"])

    else:
        pass
Example #7
def tokenize_df(df): 
    
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stopwords = remover.getStopWords()
    stemmer = PorterStemmer()
    stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                      ArrayType(StringType()))

    df = df.select(clean_text(col("text")).alias("text"))
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
        )
    
    return df
def init_base_df(file_path=default_file_path):
    # Set legacy parsing as Spark 3.0+ cannot use 'E' for timestamp
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    print("Loading", default_file_path)

    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )

    # Parse string to timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )

    df = time_parsed_df.drop("query").drop("datetime")

    # Shift polarity from a range of [0:4], to [-1:1]
    scaled_polarity_df = df.withColumn("sentiment", (col("polarity") / 2) - 1).drop(
        "polarity"
    )

    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")

    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")

    remover = StopWordsRemover()
    stopwords = remover.getStopWords()

    remover.setInputCol("vector")
    remover.setOutputCol("tokens")

    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")

    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df, on=["tweet_id"])

    return tweets_with_tokens_df
# Removing punctuation
from pyspark.sql.functions import regexp_replace
# Regular expression (REGEX) to match commas and hyphens
REGEX = '[,\\-]'
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))

# Text to tokens
from pyspark.ml.feature import Tokenizer
books = Tokenizer(inputCol='text', outputCol='tokens').transform(books)

# Remove stop words
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover()
# Take a look at the list of stop words
stopwords.getStopWords()
# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
from pyspark.ml.feature import HashingTF
hasher = HashingTF(inputCol='words', outputCol='hash', numFeatures=32)
books = hasher.transform(books)

# Dealing with common words
from pyspark.ml.feature import IDF 
books = IDF(inputCol='hash', outputCol='features').fit(books).transform(books)

Example #10
    '''expres = [split(col("sentence"), " ").alias("sentence")]
    sentenceDataFrame = sentenceDataFrame.withColumn("sentence", *expres)
    remover = StopWordsRemover(inputCol="sentence", outputCol="filtered")
    swlist = remover.getStopWords()
    swlist.append("")
    remover.setStopWords(swlist)
    final = remover.transform(sentenceDataFrame.select("sentence"))'''

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    print(tokenized.columns)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    swlist = remover.getStopWords()
    swlist.append("")
    remover.setStopWords(swlist)
    tokenized = remover.transform(tokenized)  # keep "sentence" for the select below

    tokenized.select("sentence", "words")\
        .withColumn("tokens", countTokens(col("words"))).show()

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    tf = hashingTF.transform(tokenized)
    tf.select('rawFeatures').take(2)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    print(tfidf.select("features").first())
    spark.stop()
Example #11
def Run_N_Grams(root_dir, file_name, n, app_name, extension_type, project_name):
    # type: (...) -> None
    """
    Run the N-Grams language frequency algorithm.

    :param root_dir:        the root directory
    :param file_name:       the name of the csv file to be loaded
    :param n:               the non-negative integer value for generating the N-Grams
    :param app_name:        the name of the spark application
    :param extension_type:  the file extension[.eml, .pdf, .doc, .docx, .rtf]
    :param project_name:    the name of the project[SA_Claims, Personal_Umbrella]

    :return: None
    """
    if os.getcwd() != root_dir:
        print('changing directory: {}'.format(root_dir))
        os.chdir(root_dir)


    # - list data containers
    raw_data    = []    # holds the raw text data
    data_list   = []    # holds the parsed text data
    row_lengths = []    # stores the row lengths

    # - read the csv file using the 'with' context manager
    if os.path.isfile(os.path.join(root_dir, file_name)):
        with open(os.path.join(root_dir, file_name)) as spark_file:
            for row in spark_file:
                data_list.append(tuple(row.strip('\n').split(',', 3)[1:]))
                row_lengths.append(len(row.rstrip('\n').split(',', 3)[1:]))
                raw_data.append(row)

    # - reset the column names
    claim_id    = 'Claim_Id'
    filenames   = 'filename'
    word_list   = 'raw_wordlist'

    # - rename the columns in data_list
    data_list[0] = claim_id, filenames, word_list

    # - check to make sure row_length consists only of the value 3
    try:
        if len(set(row_lengths)) == 1:
            print("Data successfully loaded")
        if len(set(row_lengths)) > 1:
            raise Exception("LoadDataException: Data not transformed properly.")
    except Exception as e:
        print(e)

    # - create a spark session
    spark = SparkSession.builder \
                        .appName(app_name) \
                        .getOrCreate()

    # - create a spark data frame
    sdf = spark.createDataFrame(data_list[1:], list(map(lambda x: str(x), [claim_id, filenames, word_list])))

    # - setup tokenizer
    tokenizer = Tokenizer(inputCol='raw_wordlist', outputCol="inter_wordlist")

    # - create an output vector
    vector_df = tokenizer.transform(sdf).select(["Claim_Id", "filename", "inter_wordlist"])

    # - stop words to be removed (built here but not applied in this function)
    remover = StopWordsRemover()
    stopwords = remover.getStopWords()
    stopwords = {
        'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
        'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
        'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an', 'the',
        'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
        'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
        'will', 'just', 'don', 'should', 'now', ' a ', 'insured', 'sured',
        'coverage', 'year', 'dob', 'insd', 'left'
    }

    # - create the N-gram
    ngram = NGram(n=n, inputCol="inter_wordlist", outputCol="wordlist")
    dev_sdf = ngram.transform(vector_df)
    dev_sdf = dev_sdf.where(size(col("wordlist")) >= n)
    filtered_sdf = dev_sdf.select("Claim_Id", "filename", explode(dev_sdf.inter_wordlist).alias('wordlist'))

    # - save to hive: Update the save name
    hdfs_location = '/bda/claimsops/data/'
    current_date  = str(datetime.today())[:10].replace('-', '_')
    n_gram_str = ''
    if n == 2:
        n_gram_str = 'BiGrams'
    else:
        n_gram_str = str(n).upper() + 'Grams'

    hive_str = hdfs_location + project_name + '/' + extension_type + '_' + n_gram_str + '_' + current_date
    print(hive_str)
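    # The function stops at printing the target path. A minimal sketch (an
    # assumption, not the original persistence code) of the implied save step:
    filtered_sdf.write.mode("overwrite").parquet(hive_str)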
Example #12
    ],
    ['cls','sent']
)

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, HashingTF, StopWordsRemover, RegexTokenizer
stopwords = list()
_mystopwords = [u"나", u"너", u"우리"]  # Korean pronouns: "I", "you", "we"
for e in _mystopwords:
    stopwords.append(e)

labelIndexer = StringIndexer(inputCol="cls", outputCol="label")
regexTok = RegexTokenizer(inputCol="sent", outputCol="wordsRegex", pattern="\\s+")
#tokenizer = Tokenizer(inputCol="sent", outputCol="words")
stop = StopWordsRemover(inputCol="wordsRegex", outputCol="nostops")
_stopwords=stop.getStopWords()
for e in _stopwords:
    stopwords.append(e)
stop.setStopWords(stopwords)

hashingTF = HashingTF(inputCol="nostops", outputCol="features")
pipeline = Pipeline(stages=[labelIndexer,regexTok,stop,hashingTF])
model=pipeline.fit(df)
trainDf = model.transform(df)

trainDf.select('cls','label','features').show()

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

trainRdd = trainDf\
# MAGIC %md
# MAGIC #### 3. Get insights of users
# MAGIC Get all owners (labeled as one as well as predicted as one) and compute the word frequencies

# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover

df_all_owner = df_pets.select('text').union(
    pred_all.filter(F.col('prediction') == 1.0).select('text'))

stopwords_custom = ['im', 'get', 'got', 'one', 'hes', 'shes', 'dog', 'dogs', 'cats', 'cat', 'kitty', 'much', 'really', 'love','like','dont','know','want','thin',\
                    'see','also','never','go','ive']

remover1 = StopWordsRemover(inputCol="raw", outputCol="filtered")
core = remover1.getStopWords()
core = core + stopwords_custom
remover = StopWordsRemover(inputCol="text",
                           outputCol="filtered",
                           stopWords=core)
df_all_owner = remover.transform(df_all_owner)

wc = df_all_owner.select('filtered').rdd.flatMap(
    lambda a: a.filtered).countByValue()

# COMMAND ----------

df_all_owner.show(1)

# COMMAND ----------
  plt.axis("off")

# Plot the word cloud:
plot_word_cloud(tokenized, "words")


# ### Remove common (stop) words from each review

# Note that the ride reviews contain a number of common words such as "the"
# that we do not expect to be relevant.
# Use the
# [StopWordsRemover](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
# class to remove these so-called *stop words*:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="relevant_words")
remover.getStopWords()[:10]
removed = remover.transform(tokenized)
removed.select("words", "relevant_words").head(5)

# Plot the word cloud:
plot_word_cloud(removed, "relevant_words")


# ### Count the frequency of words in each review

# Use the
# [CountVectorizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizer)
# class to compute the term frequency:
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol="relevant_words", outputCol="word_count_vector", vocabSize=100)
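
# The vectorizer above is only constructed. A short follow-on sketch (assuming the
# `removed` DataFrame from the stop-word step above) that fits and applies it:
vectorizer_model = vectorizer.fit(removed)
vectorizer_model.vocabulary[:10]
vectorized = vectorizer_model.transform(removed)
vectorized.select("relevant_words", "word_count_vector").head(5)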
Example #15
	train_df = train_df.withColumn("numeric_label", udfNumericLabel("label"))
	test_df = test_df.withColumn("numeric_label", udfNumericLabel("label"))
	train_df = train_df.withColumnRenamed('label','text_label')
	train_df = train_df.withColumnRenamed('numeric_label','label')
	test_df = test_df.withColumnRenamed('label','text_label')
	test_df = test_df.withColumnRenamed('numeric_label','label')
	

	# Tokenize the doc into words
	tokenizer = Tokenizer(inputCol="doc",outputCol="words")
	wordsDataTrain = tokenizer.transform(train_df)
	wordsDataTest = tokenizer.transform(test_df)

	# Remove Stop Words
	remover = StopWordsRemover(inputCol="words",outputCol="filtered")
	sw = remover.getStopWords()
	sw.append('subject')
	sw.append('re')
	sw.append('email')
	remover.setStopWords(sw)
	cleanDataTrain = remover.transform(wordsDataTrain)
	cleanDataTest = remover.transform(wordsDataTest)

	# Make 1-grams (unigrams)
	onegram = NGram(n=1, inputCol="filtered",outputCol="onegram")
	onegramedDataTrain=onegram.transform(cleanDataTrain)
	onegramedDataTest = onegram.transform(cleanDataTest)

	# Compute the hashed term-frequency vector for each document
	hashingTF = HashingTF(inputCol="onegram", outputCol="rawFeatures", numFeatures=100000)
	featurizedDataTrain = hashingTF.transform(onegramedDataTrain)
from pyspark.sql.functions import *
from pyspark.sql.types import *

tokenizer = Tokenizer(inputCol="cluster_text", outputCol="words")

countTokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(documents)

# StopWordsRemover
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Add stopwords to the existing list.
add_stopwords = ["like", "-", "the", "to", "@", "get", "got", "i´m", "don´t"]
newStopwords = remover.getStopWords() + add_stopwords
remover.setStopWords(newStopwords)
remover.getStopWords()

# transform twitter text by removing stopwords
tokenized = remover.transform(tokenized)

# Explode and aggregate words
tokenized = tokenized.withColumn("word", F.explode('filtered'))

# Add a counter column initialized to 1.
tokenized = tokenized.withColumn("count", F.lit(1))

# Count words within each prediction (cluster) group.
countedWords = tokenized.groupBy("prediction", "word")\
  .agg(F.count("count").alias("wordCount"))#.orderBy("wordCount", ascending=False).show()
Example #17
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="vector")
vector_df = tokenizer.transform(clean_text_df).select("vector")

vector_df.printSchema()
vector_df.show(10)

"""**3. Remove** **stop words**"""

from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover()
stopwords = remover.getStopWords()

# Display default list
stopwords[:10]

# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("Body_no_stopw")

# Transform existing dataframe with the StopWordsRemover
Body_no_stopw_df = remover.transform(vector_df).select("Body_no_stopw")

# Display
Body_no_stopw_df.printSchema()
Body_no_stopw_df.show()
                                        F.lower(
                                            F.col("tweet_text"))).withColumn(
                                                "tweet_text",
                                                F.trim(F.col("tweet_text"))))

    #============================================
    # preprocessing
    #============================================
    # 2.1. tokenize
    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tokens")

    # 2.2. remove stopwords
    stopword_remover = StopWordsRemover(inputCol="tokens",
                                        outputCol="remove_stop")

    stopwords_list = stopword_remover.getStopWords()
    stopwords_list = stopwords_list + more_stopwords
    stopword_remover.setStopWords(stopwords_list)
    # 2.3. stemming
    # TODO: how to modify the stemming function into a transformer?
    stemmer = PorterStemmer()
    # more straightforward to use lambda
    stem_udf = F.udf(lambda l: [stemmer.stem(word) for word in l],
                     returnType=ArrayType(StringType()))

    df_tokenized = tokenizer.transform(df_select_clean)
    df_rmstop = stopword_remover.transform(df_tokenized)
    df_stemmed = df_rmstop.withColumn("stemmed",
                                      stem_udf(F.col("remove_stop")))
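
    # The TODO above asks how to turn the stemming step into a pipeline stage. A
    # minimal sketch (an assumption, not this project's code) of a custom
    # Transformer wrapping the same Porter-stemming UDF:
    from pyspark.ml import Transformer

    class PorterStemTransformer(Transformer):
        """Stems every token in an array-of-strings input column."""

        def __init__(self, inputCol="remove_stop", outputCol="stemmed"):
            super(PorterStemTransformer, self).__init__()
            self.inputCol = inputCol
            self.outputCol = outputCol

        def _transform(self, dataset):
            porter = PorterStemmer()
            stem = F.udf(lambda tokens: [porter.stem(t) for t in tokens],
                         ArrayType(StringType()))
            return dataset.withColumn(self.outputCol, stem(F.col(self.inputCol)))

    # e.g. df_stemmed_alt = PorterStemTransformer().transform(df_rmstop)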

    # Load the trained LDA model