def test_stopwordsremover(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str))
    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])
    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
    # with locale
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
def transform(self):
    # strip punctuation and repeated whitespace from the text column
    df2 = self.dataframe.withColumn(
        "_2", regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
    df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))

    # detect the language of each document
    language_detect = udf(lambda x: detect(x), returnType=StringType())
    df3 = df.withColumn("lang", language_detect('_2'))

    lemmatizer = Lemmatizer(lookup=delook)   # German lookup
    lemmatizer1 = Lemmatizer(lookup=enlook)  # English lookup

    tokenizer = Tokenizer(inputCol="_2", outputCol="words")
    tokenized = tokenizer.transform(df3)
    # print(tokenized)

    # lemmatize with the German or English lookup depending on the detected language
    lemma = udf(
        lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
        if lang == "de"
        else " ".join([lemmatizer1.lookup(i) for i in x]),
        returnType=StringType())
    lemmatized = tokenized.withColumn(
        "stemmed", lemma(col('words'), col('lang'))).drop('words').drop('_2')

    tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
    tokenized = tokenizer.transform(lemmatized)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    stopwords = remover.loadDefaultStopWords("german") + \
        remover.loadDefaultStopWords("english")
    remover = remover.setStopWords(stopwords)
    newDataSet = remover.transform(tokenized)

    # explode the filtered tokens and count word frequency per document
    test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
        .groupBy("_1", "filtered") \
        .agg(func.count(func.lit(1)).alias("count")) \
        .sort(col("count").desc())
    return test
df = df.withColumn('lemmed_tokens', lemma_udf(df.tokens))

# remove stopwords
print('Removing Stop Words...')
swr = StopWordsRemover(inputCol='lemmed_tokens', outputCol='filtered_tokens')
stops = swr.loadDefaultStopWords('english')
# str.replace returns a new string, so rebuild the list to strip curly apostrophes
stops = [stop.replace('’', '') for stop in stops]
stops.extend([
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ha', 'wa',
    'getty', 'image', 'ap', 'pictwittercom'
])
swr.setStopWords(stops)
df = swr.transform(df)
df = df.select('post_id', 'filtered_tokens')
print("Post Stop Word Remove")
df.take(1)
df.cache()

# options for NLP here: word2vec to drop synonyms, n-grams, etc.
# tokens to counts
print('Processing Through Count Vectorizer...')
cv = CountVectorizer(inputCol='filtered_tokens', outputCol='counts', minDF=5,
print(files_path_test)
print("model will be saved in \n", s3_bucket + pipelinePath)
print(50 * "=")

df_train = spark.read.format("parquet").load(s3_bucket + files_path_train).repartition(4)
df_test = spark.read.format("parquet").load(s3_bucket + files_path_test).repartition(4)

# ============================================
# extra step to remove frequent words
# ============================================
stopword_remover_stem = StopWordsRemover(inputCol="stemmed", outputCol="stemmed_rm")
stopword_remover_stem.setStopWords(extra_for_stemmed)
df_train = stopword_remover_stem.transform(df_train)
df_test = stopword_remover_stem.transform(df_test)

# ============================================
# cache and print some basic information
# ============================================
df_train.cache()
df_test.cache()
print("Train/test data info:")
print(50 * "=")
print(f"nums of training data: {df_train.count(): 10d}")
print(f"nums of test data: {df_test.count(): 10d}")
print(50 * "=")
'''expres = [split(col("sentence"), " ").alias("sentence")]
sentenceDataFrame = sentenceDataFrame.withColumn("sentence", *expres)
remover = StopWordsRemover(inputCol="sentence", outputCol="filtered")
swlist = remover.getStopWords()
swlist.append("")
remover.setStopWords(swlist)
final = remover.transform(sentenceDataFrame.select("sentence"))'''

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
countTokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(sentenceDataFrame)
print(tokenized.columns)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
swlist = remover.getStopWords()
swlist.append("")  # also filter out empty tokens
remover.setStopWords(swlist)
# keep the full DataFrame so the "sentence" column is still available below
tokenized = remover.transform(tokenized)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show()

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(tokenized)
tf.select('rawFeatures').take(2)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)
print(tfidf.select("features").first())

spark.stop()
df.show(10, False)

tokenizer = RegexTokenizer(inputCol="allTextString", outputCol="word_tokens", pattern="\\W")
TokenizerData = tokenizer.transform(df)
df = TokenizerData

remover = StopWordsRemover(inputCol="word_tokens", outputCol="stop_removed")
my_sw = [
    'united', 'states', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
    'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
    'y', 'z'
]
sw = remover.loadDefaultStopWords("english")
remover.setStopWords(sw + my_sw)
StopWordsRemoverData = remover.transform(df)
df = StopWordsRemoverData

cv = CountVectorizer(inputCol="stop_removed", outputCol="CountVectorizer",
                     vocabSize=1000, minDF=1.0, minTF=1.0)
transformer = cv.fit(df)
print(" ----------- ", transformer.vocabulary)
vacabulary = transformer.vocabulary
CountVectorizerData = transformer.transform(df)
df = CountVectorizerData

# Trains a LDA model.
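# A hedged sketch of the LDA step the comment above refers to (the original is
# truncated here); k, maxIter, and the printing loop are illustrative only.
from pyspark.ml.clustering import LDA

lda = LDA(k=10, maxIter=20, featuresCol="CountVectorizer")
lda_model = lda.fit(df)

# Map each topic's term indices back to words via the CountVectorizer vocabulary.
for row in lda_model.describeTopics(maxTermsPerTopic=10).collect():
    print(row.topic, [vacabulary[i] for i in row.termIndices])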
from pyspark.sql.types import *

tokenizer = Tokenizer(inputCol="cluster_text", outputCol="words")
countTokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(documents)

# StopWordsRemover
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Add stopwords to the existing list.
add_stopwords = ["like", "-", "the", "to", "@", "get", "got", "i´m", "don´t"]
newStopwords = remover.getStopWords() + add_stopwords
remover.setStopWords(newStopwords)
remover.getStopWords()

# transform twitter text by removing stopwords
tokenized = remover.transform(tokenized)

# Explode and aggregate words
tokenized = tokenized.withColumn("word", F.explode('filtered'))
# Add a counter column initialized to 1.
tokenized = tokenized.withColumn("count", F.lit(1))
# Count words within the same prediction (cluster) group.
countedWords = tokenized.groupBy("prediction", "word")\
    .agg(F.count("count").alias("wordCount"))  # .orderBy("wordCount", ascending=False).show()
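# A hedged follow-up: one way to keep only the top words per cluster, using a
# window function (illustrative; not part of the original snippet).
from pyspark.sql.window import Window

w = Window.partitionBy("prediction").orderBy(F.col("wordCount").desc())
topWords = countedWords.withColumn("rank", F.row_number().over(w)) \
    .filter(F.col("rank") <= 10)
topWords.show()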
sc = spark.sparkContext
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoop_conf.set('fs.s3a.awsAccessKeyId', access_id)
hadoop_conf.set('fs.s3a.awsSecretAccessKey', access_key)

infilepath = 's3a://bkrull-insight-bucket/tokenize/{year}-{month:02d}'
outfilepath = 's3a://bkrull-insight-bucket/results/{subreddit}/{week}'
cols = ['subreddit', 'date', 'results']

# Create Spark ML Pipeline stages ----------------------------------
remover = StopWordsRemover(inputCol='body', outputCol='filtered')
stopwords = remover.getStopWords()
stopwords.extend(extra_stopwords)
remover.setStopWords(stopwords).setCaseSensitive(True)
cv = CountVectorizer(inputCol='filtered', outputCol='features', minDF=1.0)
lda = LDA(k=5, maxIter=10, optimizer='online')
pipeline = Pipeline(stages=[remover, cv, lda])
# ------------------------------------------------------------------

# Get and filter data ----------------------------------------------
df = spark.read.json(infilepath.format(year=year, month=month))
subreddit_df = df.filter(df.subreddit == subreddit)
tokens = subreddit_df.filter(df.created_utc.between(lit(week), lit(week_end))) \
    .select('body')
# ------------------------------------------------------------------

num_docs = tokens.count()
if num_docs >= 1:
    model = pipeline.fit(tokens)
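    # A hedged sketch of how the fitted pipeline might be inspected; the variable
    # names below (cv_model, lda_model, vocab) are illustrative, not from the original.
    cv_model, lda_model = model.stages[1], model.stages[2]
    vocab = cv_model.vocabulary
    for row in lda_model.describeTopics(maxTermsPerTopic=10).collect():
        print(row.topic, [vocab[i] for i in row.termIndices])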
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, HashingTF, StopWordsRemover, RegexTokenizer

stopwords = list()
_mystopwords = [u"나", u"너", u"우리"]
for e in _mystopwords:
    stopwords.append(e)

labelIndexer = StringIndexer(inputCol="cls", outputCol="label")
regexTok = RegexTokenizer(inputCol="sent", outputCol="wordsRegex", pattern="\\s+")
# tokenizer = Tokenizer(inputCol="sent", outputCol="words")
stop = StopWordsRemover(inputCol="wordsRegex", outputCol="nostops")
_stopwords = stop.getStopWords()
for e in _stopwords:
    stopwords.append(e)
stop.setStopWords(stopwords)
hashingTF = HashingTF(inputCol="nostops", outputCol="features")

pipeline = Pipeline(stages=[labelIndexer, regexTok, stop, hashingTF])
model = pipeline.fit(df)
trainDf = model.transform(df)
trainDf.select('cls', 'label', 'features').show()

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

trainRdd = trainDf\
    .rdd\
    .map(lambda row: LabeledPoint(row.label, Vectors.fromML(row.features)))
trainRdd.take(1)
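# A hedged continuation: with an RDD of LabeledPoints, one common next step is an
# mllib classifier such as multinomial NaiveBayes (illustrative, not from the original).
from pyspark.mllib.classification import NaiveBayes

nbModel = NaiveBayes.train(trainRdd, lambda_=1.0)
predictionAndLabel = trainRdd.map(lambda lp: (nbModel.predict(lp.features), lp.label))
accuracy = predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / float(trainRdd.count())
print("train accuracy:", accuracy)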
def get_trending_news(rdd):
    if not rdd.isEmpty():
        spark = getSparkSessionInstance(rdd.context.getConf())
        df = spark.createDataFrame(rdd)

        # Append the title and summary together
        df_news_concat = df.withColumn("news_content",
                                       fn.concat_ws(" ", df.title, df.summary))
        df_punc_removed = df_news_concat.withColumn(
            "news_content_removed",
            fn.regexp_replace(df_news_concat.news_content, r"\p{Punct}", ""))
        udf_remove_unicode = fn.udf(
            lambda x: x.encode("ascii", "ignore").decode("ascii"))
        df_news_content_ascii = df_punc_removed.withColumn(
            "news_content_ascii",
            udf_remove_unicode(df_punc_removed.news_content_removed))

        # insert raw data to the cassandra table
        df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table="travel_news_data", keyspace="news_stream_analysis") \
            .save(mode="append")

        tokenizer = Tokenizer(inputCol="news_content_ascii", outputCol="content_words")
        df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop("news_content")

        remover = StopWordsRemover(inputCol="content_words", outputCol="filtered_words")
        stop_words = remover.loadDefaultStopWords("english")
        stop_words.extend([
            '', "travel", "trip", "submitted", "abc", "reditt", "by", "time",
            "timing", "comments", "comment", "thank", "link", "im", "thanks",
            "would", "like", "get", "good", "go", "may", "also", "going",
            "dont", "want", "see", "take", "looking", ""
        ])
        remover.setStopWords(stop_words)
        df_stop_words_removed = remover.transform(df_tokenized_content).drop("content_words")

        cv = CountVectorizer(inputCol="filtered_words", outputCol="rawFeatures")
        cv_model = cv.fit(df_stop_words_removed)
        df_tf_data = cv_model.transform(df_stop_words_removed)
        df_features = df_tf_data.select(df_tf_data.rawFeatures.alias("features"))

        def convert_term_indices_to_term(term_indices, vocab):
            terms = []
            for t in term_indices:
                terms.append(vocab[t])
            return str(terms)

        # LDA
        lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50)
        model = lda.fit(df_features)
        df_topics = model.describeTopics()

        fn_term_indices_to_term = fn.udf(convert_term_indices_to_term)
        vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary])
        df_lda_result = df_topics.withColumn(
            "terms", fn_term_indices_to_term("termIndices", vocab_lit))
        df_lda_result.select("topic", "termIndices", "terms").show(truncate=False)
        df_lda_result.cache()

        lda_terms = df_lda_result.select("terms").collect()
        lda_terms_list = [str(i.terms) for i in lda_terms]

        # based on model terms choose news stories
        for term_list in lda_terms_list:
            s = []
            topic_words = term_list[1:-1].split(",")
            for term in topic_words:
                term = term.split("'")[1]
                s.append(r"(^|\W)" + str(term) + r"($|\W)")
            rx = '|'.join('(?:{0})'.format(x.strip()) for x in s)
            df_results = df_news_content_ascii.filter(
                df_news_content_ascii['news_content_ascii'].rlike(rx))
            df_results = df_results.withColumn("topic_words",
                                               fn.lit(str(topic_words)[1:-1]))
            df_results = df_results.withColumn("results_date",
                                               fn.lit(datetime.datetime.now()))

            # insert results with the raw data to the cassandra table
            df_results.select("id", "news_provider", "published", "summary", "title",
                              "topic_words", "results_date") \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table="travel_news_data_results", keyspace="news_stream_analysis") \
                .save(mode="append")
"tweet_text", F.trim(F.col("tweet_text")))) #============================================ # preprocessing #============================================ # 2.1. tokenize tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tokens") # 2.2. remove stopwords stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="remove_stop") stopwords_list = stopword_remover.getStopWords() stopwords_list = stopwords_list + more_stopwords stopword_remover.setStopWords(stopwords_list) #2.3. stemming # TODO: how to modify the stemming function into a transformer? stemmer = PorterStemmer() # more straightforward to use lambda stem_udf = F.udf(lambda l: [stemmer.stem(word) for word in l], returnType=ArrayType(StringType())) df_tokenized = tokenizer.transform(df_select_clean) df_rmstop = stopword_remover.transform(df_tokenized) df_stemmed = df_rmstop.withColumn("stemmed", stem_udf(F.col("remove_stop"))) # Load the trained LDAmodel savedPipelineModel = PipelineModel.load(pipelinePath)