Example #1
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))  # basestring is Python 2 only; use str on Python 3
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
Example #2
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input",
                                        outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame(
         [Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
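Outside the test harness, the same default / custom / locale behaviour can be exercised directly. The following is a minimal standalone sketch (the local SparkSession setup is an assumption, not part of the examples above):

from pyspark.sql import Row, SparkSession
from pyspark.ml.feature import StopWordsRemover

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([Row(input=["a", "panda"])])

remover = StopWordsRemover(inputCol="input", outputCol="output")
remover.transform(df).show()      # default English stop words remove "a"

remover.setStopWords(["panda"])   # a custom list removes "panda" instead
remover.transform(df).show()

# language selection plus a Turkish locale for case-insensitive matching
remover.setStopWords(StopWordsRemover.loadDefaultStopWords("turkish")).setLocale("tr")
remover.transform(spark.createDataFrame([Row(input=["acaba", "belki"])])).show()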
Example #3
    def transform(self):
        df2 = self.dataframe.withColumn(
            "_2",
            regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
        df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))

        language_detect = udf(lambda x: detect(x), returnType=StringType())
        df3 = df.withColumn("lang", language_detect('_2'))

        lemmatizer = Lemmatizer(lookup=delook)
        lemmatizer1 = Lemmatizer(lookup=enlook)
        tokenizer = Tokenizer(inputCol="_2", outputCol="words")
        tokenized = tokenizer.transform(df3)
        # print(tokenized)

        # lemmatize with the German lookup when the detected language is "de",
        # otherwise with the English one
        lemma = udf(lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
                    if lang == "de" else
                    " ".join([lemmatizer1.lookup(i) for i in x]),
                    returnType=StringType())

        lemmatized = tokenized.withColumn(
            "stemmed", lemma(col('words'),
                             col('lang'))).drop('words').drop('_2')
        tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
        tokenized = tokenizer.transform(lemmatized)
        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        stopwords = remover.loadDefaultStopWords(
            "german") + remover.loadDefaultStopWords("english")
        remover = remover.setStopWords(stopwords)
        newDataSet = remover.transform(tokenized)

        test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
            .groupBy("_1", "filtered") \
            .agg(func.count(func.lit(1)).alias("count")) \
            .sort(col("count").desc())

        return test
Example #4
    df = df.withColumn('lemmed_tokens', lemma_udf(df.tokens))

    # remove stopwords
    print('Removing Stop Words...')
    swr = StopWordsRemover(inputCol='lemmed_tokens',
                           outputCol='filtered_tokens')
    stops = swr.loadDefaultStopWords('english')
    # strip curly apostrophes from the default stop words (str.replace returns
    # a new string, so collect the results instead of discarding them)
    stops = [stop.replace('’', '') for stop in stops]
    for word in [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
            'ha', 'wa', 'getty', 'image', 'ap', 'pictwittercom'
    ]:
        stops.append(word)
    swr.setStopWords(stops)
    df = swr.transform(df)

    df = df.select('post_id', 'filtered_tokens')

    print("Post Stop Word Remove")
    df.take(1)
    df.cache()

    #options for NLP here: word 2 vec to drop synonyms, n-grams, etc

    #tokens to counts
    print('Processing Through Count Vectorizer...')
    cv = CountVectorizer(inputCol='filtered_tokens',
                         outputCol='counts',
                         minDF=5)
Example #5
    print(files_path_test)
    print("model will be save in \n", s3_bucket + pipelinePath)
    print(50*"=")

    df_train = spark.read.format("parquet").load(s3_bucket + files_path_train).repartition(4)
    df_test = spark.read.format("parquet").load(s3_bucket + files_path_test).repartition(4)

    

# ============================================
# extra step to remove frequent words
# ============================================

# one extra step to remove the frequent words
stopword_remover_stem = StopWordsRemover(inputCol="stemmed", outputCol="stemmed_rm")
stopword_remover_stem.setStopWords(extra_for_stemmed)  # extra_for_stemmed: list of frequent tokens built elsewhere (one way is sketched after this example)

df_train = stopword_remover_stem.transform(df_train)
df_test = stopword_remover_stem.transform(df_test)

# ============================================
# cache and print some basic information
# ============================================
df_train.cache()
df_test.cache()
print("Train/test data info:")
print(50*"=")
print(f"nums of training data: {df_train.count(): 10d}")
print(f"nums of test data: {df_test.count(): 10d}")
print(50*"=")
Example #6
    '''expres = [split(col("sentence"), " ").alias("sentence")]
    sentenceDataFrame = sentenceDataFrame.withColumn("sentence", *expres)
    remover = StopWordsRemover(inputCol="sentence", outputCol="filtered")
    swlist = remover.getStopWords()
    swlist.append("")
    remover.setStopWords(swlist)
    final = remover.transform(sentenceDataFrame.select("sentence"))'''

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    print(tokenized.columns)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    swlist = remover.getStopWords()
    swlist.append("")
    remover.setStopWords(swlist)
    # transform the full tokenized frame so the "sentence" column survives
    # for the select below
    tokenized = remover.transform(tokenized)

    tokenized.select("sentence", "words")\
        .withColumn("tokens", countTokens(col("words"))).show()

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    tf = hashingTF.transform(tokenized)
    tf.select('rawFeatures').take(2)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    print(tfidf.select("features").first())
    spark.stop()
Example #7
df.show(10, False)

tokenizer = RegexTokenizer(inputCol="allTextString",
                           outputCol="word_tokens",
                           pattern="\\W")
TokenizerData = tokenizer.transform(df)
df = TokenizerData

remover = StopWordsRemover(inputCol="word_tokens", outputCol="stop_removed")
my_sw = [
    'united', 'states', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]
sw = remover.loadDefaultStopWords("english")
remover.setStopWords(sw + my_sw)
StopWordsRemoverData = remover.transform(df)
df = StopWordsRemoverData

cv = CountVectorizer(inputCol="stop_removed",
                     outputCol="CountVectorizer",
                     vocabSize=1000,
                     minDF=1.0,
                     minTF=1.0)
transformer = cv.fit(df)
print(" ----------- ", transformer.vocabulary)
vocabulary = transformer.vocabulary
CountVectorizerData = transformer.transform(df)
df = CountVectorizerData

# Trains a LDA model.
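The example is cut off at the LDA comment; a minimal sketch of the step it points to, fed by the CountVectorizer output above (k and maxIter are assumptions):

from pyspark.ml.clustering import LDA

# Fit a topic model on the term-count features produced by the CountVectorizer.
lda = LDA(k=10, maxIter=10, featuresCol="CountVectorizer")
lda_model = lda.fit(df)
lda_model.describeTopics(maxTermsPerTopic=5).show(truncate=False)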
Example #8
from pyspark.sql.types import *

tokenizer = Tokenizer(inputCol="cluster_text", outputCol="words")

countTokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(documents)

# StopWordsRemover
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Add stopwords to the existing list.
add_stopwords = ["like", "-", "the", "to", "@", "get", "got", "i´m", "don´t"]
newStopwords = remover.getStopWords() + add_stopwords
remover.setStopWords(newStopwords)
remover.getStopWords()

# transform twitter text by removing stopwords
tokenized = remover.transform(tokenized)

# Explode and aggregate words
tokenized = tokenized.withColumn("word", F.explode('filtered'))

# Add counter column with 1 initiated.
tokenized = tokenized.withColumn("count", F.lit(1))

# Count words under the same prediction(cluster) group.
countedWords = tokenized.groupBy("prediction", "word")\
  .agg(F.count("count").alias("wordCount"))#.orderBy("wordCount", ascending=False).show()
Example #9
    sc = spark.sparkContext

    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    hadoop_conf.set('fs.s3a.awsAccessKeyId', access_id)
    hadoop_conf.set('fs.s3a.awsSecretAccessKey', access_key)

    infilepath = 's3a://bkrull-insight-bucket/tokenize/{year}-{month:02d}'
    outfilepath = 's3a://bkrull-insight-bucket/results/{subreddit}/{week}'
    cols = ['subreddit', 'date', 'results']

    # Create Spark ML Pipeline stages ----------------------------------
    remover = StopWordsRemover(inputCol='body', outputCol='filtered')
    stopwords = remover.getStopWords()
    stopwords.extend(extra_stopwords)
    remover.setStopWords(stopwords).setCaseSensitive(True)

    cv = CountVectorizer(inputCol='filtered', outputCol='features', minDF=1.0)
    lda = LDA(k=5, maxIter=10, optimizer='online')
    pipeline = Pipeline(stages=[remover, cv, lda])
    # ------------------------------------------------------------------

    # Get and filter data ----------------------------------------------
    df = spark.read.json(infilepath.format(year=year, month=month))
    subreddit_df = df.filter(df.subreddit == subreddit)
    tokens = subreddit_df.filter(df.created_utc.between(lit(week), lit(week_end))) \
             .select('body')
    # ------------------------------------------------------------------
    num_docs = tokens.count()
    if num_docs >= 1:
        model = pipeline.fit(tokens)
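The snippet ends right after fitting. If you want to inspect the result, the fitted stages can be pulled back out of the PipelineModel (a sketch, assuming the three-stage remover/cv/lda pipeline above):

        # Not part of the original: stage 1 is the CountVectorizerModel,
        # stage 2 the LDAModel of the fitted pipeline.
        cv_model, lda_model = model.stages[1], model.stages[2]
        lda_model.describeTopics(maxTermsPerTopic=5).show(truncate=False)
        print(cv_model.vocabulary[:10])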
Example #10
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, HashingTF, StopWordsRemover, RegexTokenizer
stopwords=list()
_mystopwords=[u"나",u"너", u"우리"]
for e in _mystopwords:
    stopwords.append(e)

labelIndexer = StringIndexer(inputCol="cls", outputCol="label")
regexTok = RegexTokenizer(inputCol="sent", outputCol="wordsRegex", pattern="\\s+")
#tokenizer = Tokenizer(inputCol="sent", outputCol="words")
stop = StopWordsRemover(inputCol="wordsRegex", outputCol="nostops")
_stopwords=stop.getStopWords()
for e in _stopwords:
    stopwords.append(e)
stop.setStopWords(stopwords)

hashingTF = HashingTF(inputCol="nostops", outputCol="features")
pipeline = Pipeline(stages=[labelIndexer,regexTok,stop,hashingTF])
model=pipeline.fit(df)
trainDf = model.transform(df)

trainDf.select('cls','label','features').show()

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

trainRdd = trainDf\
    .rdd\
    .map(lambda row: LabeledPoint(row.label,Vectors.fromML(row.features)))
trainRdd.take(1)
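The example stops after building the LabeledPoint RDD; a sketch of one possible next step, training an MLlib classifier on it (NaiveBayes is my choice here, the original does not name one):

from pyspark.mllib.classification import NaiveBayes

# Train a simple classifier on the LabeledPoint RDD built above and
# sanity-check a prediction on the first training vector.
nb_model = NaiveBayes.train(trainRdd, lambda_=1.0)
print(nb_model.predict(trainRdd.first().features))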
Example #11
def get_trending_news(rdd):
    if not rdd.isEmpty():
        spark = getSparkSessionInstance(rdd.context.getConf())

        df = spark.createDataFrame(rdd)

        # Append the title and summary together
        df_news_concat = df.withColumn("news_content",
                                       fn.concat_ws(" ", df.title, df.summary))

        df_punc_removed = df_news_concat.withColumn(
            "news_content_removed",
            fn.regexp_replace(df_news_concat.news_content, r"\p{Punct}", ""))

        udf_remove_unicode = fn.udf(
            lambda x: x.encode("ascii", "ignore").decode("ascii"))
        df_news_content_ascii = df_punc_removed.withColumn(
            "news_content_ascii",
            udf_remove_unicode(df_punc_removed.news_content_removed))

        # insert raw data to the cassandra table
        df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table="travel_news_data", keyspace="news_stream_analysis") \
            .save(mode="append")

        tokenizer = Tokenizer(inputCol="news_content_ascii",
                              outputCol="content_words")
        df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop(
            "news_content")

        remover = StopWordsRemover(inputCol="content_words",
                                   outputCol="filtered_words")
        stop_words = remover.loadDefaultStopWords("english")
        stop_words.extend([
            '', "travel", "trip", "submitted", "abc", "reditt", "by", "time",
            "timing", "comments", "comment", "thank", "link", "im", "thanks",
            "would", "like", "get", "good", "go", "may", "also", "going",
            "dont", "want", "see", "take", "looking", ""
        ])
        remover.setStopWords(stop_words)
        df_stop_words_removed = remover.transform(df_tokenized_content).drop(
            "content_words")

        cv = CountVectorizer(inputCol="filtered_words",
                             outputCol="rawFeatures")
        cv_model = cv.fit(df_stop_words_removed)
        df_tf_data = cv_model.transform(df_stop_words_removed)
        df_features = df_tf_data.select(
            df_tf_data.rawFeatures.alias("features"))

        def convert_term_indices_to_term(term_indices, vocab):
            terms = []
            for t in term_indices:
                terms.append(vocab[t])

            return str(terms)

        # LDA
        lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50)
        model = lda.fit(df_features)
        df_topics = model.describeTopics()

        fn_term_indices_to_term = fn.udf(convert_term_indices_to_term)
        vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary])
        df_lda_result = df_topics.withColumn(
            "terms", fn_term_indices_to_term("termIndices", vocab_lit))
        df_lda_result.select("topic", "termIndices",
                             "terms").show(truncate=False)

        df_lda_result.cache()

        lda_terms = df_lda_result.select("terms").collect()
        lda_terms_list = [str(i.terms) for i in lda_terms]

        # based on model terms choose news stories
        for term_list in lda_terms_list:
            s = []
            topic_words = term_list[1:-1].split(",")
            for term in topic_words:
                term = term.split("'")[1]
                s.append(r"(^|\W)" + str(term) + r"($|\W)")
            rx = '|'.join('(?:{0})'.format(x.strip()) for x in s)
            df_results = df_news_content_ascii.filter(
                df_news_content_ascii['news_content_ascii'].rlike(rx))
            df_results = df_results.withColumn("topic_words",
                                               fn.lit(str(topic_words)[1:-1]))
            df_results = df_results.withColumn("results_date",
                                               fn.lit(datetime.datetime.now()))

            # insert results with the raw data to the cassandra table
            df_results.select("id", "news_provider", "published", "summary", "title", "topic_words", "results_date") \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table="travel_news_data_results", keyspace="news_stream_analysis") \
                .save(mode="append")
                                                "tweet_text",
                                                F.trim(F.col("tweet_text"))))

    #============================================
    # preprocessing
    #============================================
    # 2.1. tokenize
    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tokens")

    # 2.2. remove stopwords
    stopword_remover = StopWordsRemover(inputCol="tokens",
                                        outputCol="remove_stop")

    stopwords_list = stopword_remover.getStopWords()
    stopwords_list = stopwords_list + more_stopwords
    stopword_remover.setStopWords(stopwords_list)
    # 2.3. stemming
    # TODO: how to modify the stemming function into a transformer?
    #       (one possible answer is sketched after this example)
    stemmer = PorterStemmer()
    # more straightforward to use lambda
    stem_udf = F.udf(lambda l: [stemmer.stem(word) for word in l],
                     returnType=ArrayType(StringType()))

    df_tokenized = tokenizer.transform(df_select_clean)
    df_rmstop = stopword_remover.transform(df_tokenized)
    df_stemmed = df_rmstop.withColumn("stemmed",
                                      stem_udf(F.col("remove_stop")))

    # Load the trained LDAmodel
    savedPipelineModel = PipelineModel.load(pipelinePath)
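The TODO in the stemming step above asks how the stemming UDF could become a proper pipeline stage. A minimal sketch of such a custom Transformer (the class name, default column names, and the NLTK import are assumptions, not part of the original code):

from nltk.stem.porter import PorterStemmer
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType


class PorterStemTransformer(Transformer, HasInputCol, HasOutputCol):
    """Hypothetical pipeline stage that stems an array-of-strings column."""

    def __init__(self, inputCol="remove_stop", outputCol="stemmed"):
        super(PorterStemTransformer, self).__init__()
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        stemmer = PorterStemmer()
        stem_udf = F.udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                         ArrayType(StringType()))
        return df.withColumn(self.getOutputCol(),
                             stem_udf(F.col(self.getInputCol())))


# It could then sit in a Pipeline next to the other stages, e.g.
# Pipeline(stages=[tokenizer, stopword_remover, PorterStemTransformer()]).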