drop_list = [
    'Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y'
]

data = data.select(
    [column for column in data.columns if column not in drop_list])
data.show(5)
data.printSchema()

##################################################
################## Transformers ##################
##################################################

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Descript",
                                outputCol="words",
                                pattern="\\W")
# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c",
                 "the"]  # standard stop words
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")
transformers = [
    regexTokenizer, stopwordsRemover, countVectors, label_stringIdx
]
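# Sketch (assumption, not part of the original snippet): chain the transformers
# above into a single Pipeline and fit it on the prepared DataFrame.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=transformers)
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)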
sqlContext = SQLContext(sc)
df = pd.DataFrame(train_data)
# df = df.transpose()
df.columns = ['tweet_id', 'tweet_label', 'tweet_words']
data_complete = sqlContext.createDataFrame(df)
data = data_complete.select(['tweet_label', 'tweet_words'])
data.show(5)

data.groupBy("tweet_label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="tweet_words",
                                outputCol="words",
                                pattern="\\W")

# stop words
f = open("stopwords_twitter.txt", "r")
add_stopwords = []
for l in f.readlines():
    add_stopwords.append(l.strip())
print(add_stopwords)

stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
# countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
hashingTF = HashingTF(inputCol="filtered",
Example No. 3
    def __init__(self):
        # Load the hotel reviews CSV into a PySpark dataframe.
        df = sqlContext.read.format("csv").option("header", "true").load("hotel-reviews.csv")
        # df = sqlContext.createDataFrame(pandas_df)

        # Convert Reviewer_Score to a binary Sentiment value (1 if score >= 7.0, else 0)
        df = df.withColumn('Reviewer_Score', fn.when(df.Reviewer_Score >= 7.0, 1).otherwise(0))
        df = df.withColumnRenamed('Reviewer_Score', 'Sentiment')

        # Concatenate the negative and positive to a single review text
        df_with_text = df.withColumn('Review_Text',
                                     fn.concat(fn.col('Negative_Review'), fn.lit(' '), fn.col('Positive_Review')))

        # Strip Dataframe to only what is necessary for sentiment analysis
        df_stripped = df_with_text.select('Negative_Review', 'Positive_Review', 'Review_Text', 'Sentiment')

        # Download a stopword list to filter out of the reviews
        stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()

        # Configure tokenizer to extract words with only letters and save in column words
        tokenizer = RegexTokenizer().setGaps(False) \
            .setPattern("\\p{L}+") \
            .setInputCol("Review_Text") \
            .setOutputCol("words")

        # Configure stopwords filter
        sw_filter = StopWordsRemover() \
            .setStopWords(stop_words) \
            .setCaseSensitive(False) \
            .setInputCol("words") \
            .setOutputCol("filtered")

        cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2 ** 17) \
            .setInputCol("filtered") \
            .setOutputCol("tf")

        # Create Pipeline with Tokenizer, Stopwords Filter and CountVectorizer
        cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(df_stripped)

        # Configure TFIDF
        idf = IDF(). \
            setInputCol('tf'). \
            setOutputCol('tfidf')

        idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(df_stripped)

        # Split data into training, validation and testing data (60%, 30%, 10%)
        training_df, validation_df, testing_df = df_stripped.randomSplit([0.6, 0.3, 0.1], seed=0)

        # Configure LogisticRegression for analysis of the reviews
        lr = LogisticRegression(). \
            setLabelCol('Sentiment'). \
            setFeaturesCol('tfidf'). \
            setRegParam(0.0). \
            setMaxIter(100). \
            setElasticNetParam(0.)

        # Create new Pipelines for the LogisticRegression and train the model
        self.model = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

        # Calculate Score of our Model using the validation Dataframe
        self.model.transform(validation_df). \
            select(fn.expr('float(prediction = Sentiment)').alias('correct')). \
            select(fn.avg('correct')).show()

        spark = SparkSession \
            .builder \
            .appName("user_input_analysis") \
            .getOrCreate()
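
    # Sketch (assumption, not part of the original class): score a single free-text
    # review with the trained pipeline (1 = positive sentiment, 0 = negative).
    def predict(self, review_text):
        spark = SparkSession.builder.getOrCreate()
        input_df = spark.createDataFrame([(review_text,)], ['Review_Text'])
        return self.model.transform(input_df).select('prediction').first()[0]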
Example No. 4
def main(review_table,business_table,output_folder):


    #Read reviews and business data
    review_df = spark.read.parquet(review_table)
    review_df.createOrReplaceTempView("reviews_table")

    business_df = spark.read.parquet(business_table)
    business_toronto=business_df.filter(business_df.City=="Toronto")
    business_toronto.createOrReplaceTempView("business_table")

    #collect reviews for each business
    business_review=spark.sql( """ SELECT BusinessID, collect_set(Review) AS total_review FROM reviews_table GROUP BY BusinessID """ )

    #concatenate the collected reviews into a single string
    merge_review = udf(lambda total_review: (" ").join(total_review))
    business_concat_review=business_review.withColumn("comb_review", merge_review(business_review['total_review'])).drop(business_review['total_review'])
    business_concat_review.createOrReplaceTempView("comb_review_table")

    #Keep reviews for businesses in Toronto
    Reviews_for_business=spark.sql(""" SELECT c.BusinessID,b.Name AS BusinessName,b.BusinessStars,c.comb_review FROM comb_review_table AS c INNER JOIN business_table AS b ON c.BusinessID=b.BusinessID """)

    #pipeline to preprocess text data
    regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'comb_review', outputCol = 'token')
    stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'no_stopword')
    countVectorizer = CountVectorizer(inputCol="no_stopword", outputCol="rawcol")
    TDF = IDF(inputCol="rawcol", outputCol="idf_vec")
    text_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, TDF])

    IDF_model = text_pipeline.fit(Reviews_for_business)
    #IDF_model.write().overwrite().save('IDF_model1')

    #collect the vocabulary from the CountVectorizer model
    vocab=IDF_model.stages[2].vocabulary

    business_review_df=IDF_model.transform(Reviews_for_business)

    #two business categories based on low and high star rating
    reviews_low=business_review_df.where(business_review_df.BusinessStars<=3)
    reviews_high=business_review_df.where(business_review_df.BusinessStars>3)

    lda = LDA(k=6, seed=123, optimizer='online', featuresCol="idf_vec")
    vocab_word = udf(lambda termIndices: [vocab[idx] for idx in termIndices])

    #topic modelling on low rating business
    lowtopic_model = lda.fit(reviews_low)
    lowtopic_transform=lowtopic_model.transform(reviews_low)
    print("topic distribution for low rating business")
    lowtopic_transform.select('BusinessID','BusinessName','topicDistribution').show(4,False)
    #lowtopic_model.write().overwrite().save('lowtopic_model')
    
    #topic distribution
    low_dist=lowtopic_transform.withColumn('topic_distribution',lowtopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    low_dist_df=low_dist.select('BusinessID','BusinessName','topic_distribution')    
    low_dist_df.write.csv(output_folder + '/Topic_low_business_topic_dist',header=True)
    
    #key topics
    lowreview_topics=lowtopic_model.describeTopics() 
    lowreview_topics_concat=lowreview_topics.withColumn("topic_word", vocab_word(lowreview_topics['termIndices']))
    
    
    low_df=lowreview_topics_concat.select('topic','topic_word')
    print("Topics for low rating business")
    low_df.show(6,False)
    low_df.coalesce(1).write.csv(output_folder + '/Topic_low_rating_topic',header=True)

    
    #topic modelling on high rating business
    high_topic_model = lda.fit(reviews_high)
    hightopic_transform=high_topic_model.transform(reviews_high)
    print("topic distribution for high rating business")
    hightopic_transform.select('BusinessID','BusinessName','topicDistribution').show(4,False)
    #high_topic_model.write().overwrite().save('high_topic_model')
    
    #topic distribution
    high_dist=hightopic_transform.withColumn('topic_distribution',hightopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    high_dist_df=high_dist.select('BusinessID','BusinessName','topic_distribution')
    high_dist_df.write.csv(output_folder + '/Topic_high_business_topic_dist',header=True)

    #key topic 
    highreview_topics=high_topic_model.describeTopics()
    highreview_topics_concat=highreview_topics.withColumn("topic_word", vocab_word(highreview_topics['termIndices']))
    high_df=highreview_topics_concat.select('topic','topic_word')
    
    print("\nTopics for high rating business")
    high_df.show(6,False)
    high_df.coalesce(1).write.csv(output_folder + '/Topic_high_rating_topic',header=True)
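
# Sketch (assumption, not part of the original snippet): a typical entry point
# would build the session and pass the table paths and output folder from argv.
if __name__ == '__main__':
    import sys
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('yelp_review_topics').getOrCreate()
    main(sys.argv[1], sys.argv[2], sys.argv[3])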
#createDataFrame: count null values per column
missing_count = spark.createDataFrame(null_value_count(news_data), ['Column_with_Null_Value', 'Null_values_count'])
missing_count.show()
#
title_category = news_data.select('TITLE', 'CATEGORY')

title_category.select('Category').distinct().count()
title_category.groupBy('Category').count().orderBy(col('Count').desc()).show(truncate = False)
title_category.groupBy('TITLE').count().orderBy(col('count').desc()).show(truncate = False)
####
#The show() calls above list the top news categories and top titles by count.
#regexp_replace: strip digit sequences from the titles
title_category = title_category.withColumn('only_str', regexp_replace(col('TITLE'), '\d+', ''))
title_category.select('TITLE', 'only_str').show(truncate = False)

#Tokenize the cleaned titles
regex_tokenizer = RegexTokenizer(inputCol = 'only_str', outputCol = 'words', pattern = '\\W')
raw_words = regex_tokenizer.transform(title_category)
raw_words.show()


remover = StopWordsRemover(inputCol = 'words', outputCol = 'filtered')
word_df = remover.transform(raw_words)
word_df.select('words', 'filtered').show(truncate = False)
indexer = StringIndexer(inputCol = 'CATEGORY', outputCol = 'categoryIndex')
feature_data = indexer.fit(word_df).transform(word_df)
feature_data.show()


cv = CountVectorizer(inputCol = 'filtered', outputCol = 'features')
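
# Sketch (assumption, not part of the original snippet): fit the CountVectorizer
# and materialise the bag-of-words features for the titles.
cv_model = cv.fit(feature_data)
news_features = cv_model.transform(feature_data)
news_features.select('filtered', 'features', 'categoryIndex').show(5)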

Example No. 6
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame(
        [(0, "Hi I heard about Spark"),
         (1, "I wish Java 12 2 could use case classes"),
         (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence",
                                    outputCol="words",
                                    pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")\
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
    print("Apa yang disini")
    # $example off$
Example No. 7
def train_model():
  '''
  if(dataRdd != None):
    print("**************************************************************************************************** Inside train model with new rdd")
    # Read the model
    pipeModel_Prev = PipelineModel.load('sentiment.model')
    
    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

    # bag of words count
    countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)

    # convert string labels to indexes
    label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

    nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")

    # convert prediction to the predictedSentiment
    indexToLabels = IndexToString(inputCol = "prediction", outputCol = "predictedSentiment", labels=["bordem","love","relief", "fun", "hate", "neutral", "anger", "happiness", "surpirse","sadness","worry", "empty"])

    # Buidl spark pipeline
    pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])

    # Fit the pipelin.
    pipeModel_Next = pipeline.fit(dataRDD)
    pipe_model_new = PipelineModel(stages = [pipeModel_Prev ,pipeModel_Next])
    print("Workinggggggggggggggg")
    pipeModel_New.save("sentiment.model")
  '''
  data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('text_emotion.csv')
  #Drop unused columns
  drop_list = ['tweet_id']
  data = data.select([column for column in data.columns if column not in drop_list]) \
             .where(
                    (data['sentiment'] == 'empty') |
                    (data['sentiment'] == 'sadness') |
                    (data['sentiment'] == 'enthusiam') |
                    (data['sentiment'] == 'worry') |
                    (data['sentiment'] == 'surprise') |
                    (data['sentiment'] == 'love') |
                    (data['sentiment'] == 'hate') |
                    (data['sentiment'] == 'anger') |
                    (data['sentiment'] == 'neutral') |
                    (data['sentiment'] == 'relief') |
                    (data['sentiment'] == 'boredom') |
                    (data['sentiment'] == 'fun') |
                    (data['sentiment'] == 'happiness')) \
             .na.drop(thresh=3)

  data.show(5)

  data.groupBy("sentiment") \
      .count() \
      .orderBy(col("count").desc()) \
      .show()

  # set seed for reproducibility
  (trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 100)
  print("Training Dataset Count: " + str(trainingData.count()))
  print("Test Dataset Count: " + str(testData.count()))

  # regular expression tokenizer
  regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

  # bag of words count
  countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)

  # convert string labels to indexes
  label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

  nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")

  # convert prediction to the predictedSentiment
  indexToLabels = IndexToString(inputCol = "prediction", outputCol = "predictedSentiment", labels=["bordem","love","relief", "fun", "hate", "neutral", "anger", "happiness", "surpirse","sadness","worry", "empty"])

  # Build the Spark pipeline
  pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])

  # Fit the pipeline.
  pipelineFit = pipeline.fit(trainingData)
  predictions = pipelineFit.transform(testData)

  # Show the top-probability predictions for each of the 12 label indexes
  for label_index in range(12):
      predictions.filter(predictions['prediction'] == label_index) \
          .select("content", "sentiment", "predictedSentiment", "probability", "label", "prediction") \
          .orderBy("probability", ascending=False) \
          .show(n=10, truncate=30)


  # Retrieve the F1 score
  evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label")
  print("F1: %g" % (evaluator.evaluate(predictions)))
  pipelineFit.save("sentiment.model")
Example No. 8
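# Sketch (assumption, not part of the original snippet): `train` and `test` look like
# scikit-learn bunches; the 20 newsgroups corpus exposes exactly these fields.
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
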
p_train = pd.DataFrame({
    'data': train.data,
    'target': train.target,
    'filenames': train.filenames
})
p_test = pd.DataFrame({
    'data': test.data,
    'target': test.target,
    'filenames': test.filenames
})

s_train = spark.createDataFrame(p_train)
s_test = spark.createDataFrame(p_test)

tokenizer = RegexTokenizer(inputCol='data', outputCol='words', pattern='\\W')
termFreq = HashingTF(inputCol='words', outputCol='freq')

pipeline = Pipeline(stages=[tokenizer, termFreq])
model = pipeline.fit(s_train)
data = model.transform(s_train)


def v_max(vector):
    return max(vector.toArray())


udf_v_max = udf(v_max, FloatType())
slen = udf(lambda s: s[0], IntegerType())
data.select(data.freq).rdd.map(lambda x: x.freq.toArray().argmax()).first()
data.first()
Example No. 9
wcss = model.computeCost(data_scaled)
centers = model.clusterCenters()

result = model.transform(data_scaled)
result.show()
result.groupBy('prediction').count().show()

#####################################################################
############# Natural Language Preprocessing
from pyspark.sql.functions import regexp_replace
data_cleaned = data.withColumn('text', regexp_replace(data.text, '[0-9]', ''))

from pyspark.ml.feature import Tokenizer, RegexTokenizer
tokenizer = Tokenizer(inputCol = 'text', outputCol = 'token')
tokenizer2 = RegexTokenizer(inputCol = 'text', outputCol = 'token', pattern = '#\w+')

data_token = tokenizer.transform(data_cleaned)  # use the digit-stripped text
data_token.show()

# n_words
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
count_token = udf(lambda token: len(token), IntegerType())
data_token.withColumn('n_token', count_token(col('token'))).show()

# stopwords
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol = 'token', outputCol = 'filtered', stopWords = ['aaa'])
data_2 = remover.transform(data_token)
Example No. 10
if __name__ == "__main__":

    spark = SparkSession \
        .builder \
        .getOrCreate()

    # Prepare data
    data = spark.read.csv("hdfs://devenv/user/spark/spark_mllib_101/spam_detection/data/sms_messages_with_labels.csv",
                          inferSchema=True,
                          header=True)


    # Preprocessing and feature engineering
    feature_prep = data.select(lower(data["message"]).alias("message"), length(data["message"]).alias("length"), "label")

    feature_prep = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W+").transform(feature_prep)

    feature_prep = StopWordsRemover(inputCol='words',outputCol='stop_words_removed').transform(feature_prep)

    feature_prep = HashingTF(inputCol="stop_words_removed", outputCol="hashing_tf", numFeatures=4000).transform(feature_prep)

    feature_prep = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(feature_prep).transform(feature_prep)

    feature_prep = StringIndexer(inputCol='label',outputCol='label_indexed').fit(feature_prep).transform(feature_prep)

    feature_prep = VectorAssembler(inputCols=["tf_idf", "length"],
                           outputCol="features").transform(feature_prep)

    final_data = feature_prep.select("label_indexed", "features")
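
    # Sketch (assumption, not part of the original snippet): split the features and
    # fit a classifier such as Naive Bayes, then eyeball the confusion counts.
    from pyspark.ml.classification import NaiveBayes

    train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
    nb_model = NaiveBayes(labelCol="label_indexed", featuresCol="features").fit(train_data)
    nb_model.transform(test_data).groupBy("label_indexed", "prediction").count().show()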

Example No. 11
df.groupBy(df.term).agg(F.avg(df.fracNumPmts)).show()
# +----+------------------+                                                       
# |term|  avg(fracNumPmts)|
# +----+------------------+
# |  60|0.5334839555374444| #repaid very early
# |  36|0.7324283072750936|
# +----+------------------+


#start with text processing (most likely it has no significant impact)
df = df.withColumn('desc', F.regexp_replace('desc', '(Borrower added on [0-9][0-9]/[0-9][0-9]/[0-9][0-9] >)|<br>|<br/>' , '').alias('desc'))
#take a look to verify
#df.select('desc').show(3,truncate=False)

#split the doc strings, or use tokenizer
regexTokenizer = RegexTokenizer(inputCol="desc", outputCol="words", pattern="\\W")	
df = regexTokenizer.transform(df)

#10-dimensional word-vector space (vectorSize=10)
word2Vec = Word2Vec(vectorSize=10, minCount=0, inputCol="words", outputCol="result")
#Fit to find word embeddings
modelW2V = word2Vec.fit(df)

#Use the embeddings to transform, with the vector for "words" in the column "result" 
df = modelW2V.transform(df)
#rows without any comment (NULL, marked 'none') will share the same vector
#df.select('desc','result').show(10,truncate=True) # set to false for large vector space

#cluster the result
kmeans = KMeans(k=3, seed=1, featuresCol="result", predictionCol="pred_KM")
modelKM = kmeans.fit(df)
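
# Sketch (assumption, not part of the original snippet): assign each loan
# description to a cluster and inspect the cluster sizes.
df = modelKM.transform(df)
df.groupBy('pred_KM').count().show()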
Example No. 12
def main(topic):
    # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic)\
        .option('failOnDataLoss', 'false')\
        .option('auto.offset.reset', 'earliest')\
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words',
                                                    ',')).select('text')
    data = data.withColumn('news_id', data['text'][0])
    data = data.withColumn('news_url', data['text'][1])
    print('finish load data')

    # 2. Scrape the news_text and tweets_comments
    data = data.withColumn('news_info', udf_get_news_info(data['news_url']))
    data = data.withColumn('news_title', data['news_info'][0])
    data = data.withColumn('news_text', data['news_info'][1])
    data = data.withColumn('news_image', data['news_info'][2])
    data = data.where(data['news_title'].isNotNull()
                      & (functions.length(data['news_title']) > 0))
    data = data.where(data['news_text'].isNotNull()
                      & (functions.length(data['news_text']) > 0))
    # data = data.where(data['tweets_comment'].isNotNull() & (functions.length(data['tweets_comment']) > 0)) # filter reviews with no text
    print('finish scrap')

    # 3. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words
    data = data.withColumn('sentiment_scores',
                           udf_sentiment_score(data['news_text']))
    news_regex_tokenizer = RegexTokenizer(inputCol='news_text',
                                          outputCol='news_words',
                                          pattern='[^A-Za-z]+')
    news_stopwords_remover = StopWordsRemover(
        inputCol='news_words',
        outputCol='news_tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(
        stages=[news_regex_tokenizer, news_stopwords_remover])
    model = nlp_pipeline.fit(data)
    nlp_data = model.transform(data).select('news_id', 'news_title',
                                            'news_text', 'news_image',
                                            'news_tokens', 'sentiment_scores')

    # 4. Select Features
    nlp_data = nlp_data.withColumn('news_tokens',
                                   udf_morphy(nlp_data['news_tokens']))
    # nlp_data = nlp_data.withColumn('tweets_tokens', udf_morphy(nlp_data['tweets_tokens']))
    # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0)
    # nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0)
    # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+'))
    nlp_data = nlp_data.withColumn('news_tokens',
                                   functions.concat_ws(' ', 'news_tokens'))
    print('finish scores')

    # 5. Save
    nlp_data = nlp_data.withColumn(
        'dl_value',
        functions.to_json(
            functions.struct([nlp_data[x] for x in nlp_data.columns])))

    stream = nlp_data.select(nlp_data.news_id.alias("key"),
                             nlp_data.dl_value.alias("value"))\
        .writeStream\
        .format('kafka')\
        .outputMode('update')\
        .option('kafka.bootstrap.servers', 'localhost:9092')\
        .option("topic", "mlnews-2")\
        .option("checkpointLocation", "../check")\
        .start()

    # stream = nlp_data.writeStream.format('console').outputMode('update').start()
    stream.awaitTermination()
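
# Sketch (assumption, not part of the original snippet): a typical entry point
# passing the Kafka topic name on the command line.
if __name__ == '__main__':
    import sys
    main(sys.argv[1])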
Example No. 13
    "id",
    row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
train_df.createOrReplaceTempView("train_df")
train_df.show(5)

test_df = test_df.withColumn(
    "id",
    row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
test_df.createOrReplaceTempView("test_df")
test_df.show(5)

########################################################################################################
# Build pipeline and run
indexer = StringIndexer(inputCol="category", outputCol="label")
tokenizer = RegexTokenizer(pattern=u'\W+',
                           inputCol="text",
                           outputCol="words",
                           toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Building the model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(
    train_df
)  #if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here
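
# Sketch (assumption, not part of the original snippet): score the test-set
# predictions; the default metric of this evaluator is F1.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
print("F1 on test set: %g" % evaluator.evaluate(pred))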
Example No. 14
#
df3 = df2.filter(df2.message.contains('\"Url\":\"https://isl-ca.dazn.com/misl/v2/Playback'))\
    .filter(df2.message.contains('&Format=MPEG-DASH&'))\
    .filter(df2.message.contains('\"User-Agent\":\"Mozilla/5.0,(Macintosh; Intel Mac OS X 10_12_6),AppleWebKit/605.1.15,(KHTML, like Gecko),Version/11.1.2,Safari/605.1.15\"},'))\
    .filter(df2.message.contains(',\"Response\":{\"StatusCode\":200,\"ReasonPhrase\":\"OK\",'))
df3.printSchema()
df4 = df3.withColumn(
    "messagecut",
    expr(
        "substring(message, locate('|Livesport.WebApi.Controllers.Playback.PlaybackV2Controller|',message)+60 , length(message)-1)"
    ))
#
#val regexTokenizer = new RegexTokenizer().setInputCol("messagecut").setOutputCol("words").setPattern("\\w+|").setGaps(false)
regexTokenizer = RegexTokenizer(minTokenLength=1,
                                gaps=False,
                                pattern='\\w+|',
                                inputCol="messagecut",
                                outputCol="words",
                                toLowercase=True)
#
tokenized = regexTokenizer.transform(df4)
tokenized.printSchema()
tokenized.coalesce(1).write.json(output_file1)
#
df5 = sqlContext.read.json(input_file2).filter("message IS NOT NULL")
#
ngram = NGram(n=90, inputCol="words", outputCol="ngrams")
#
ngramDataFrame = ngram.transform(df5)
ngramDataFrame.select("ngrams").coalesce(1).write.json(output_file2)
#
#
Example No. 15
    ])

    data_df = spark.read.csv(file_path,
                             header=True,
                             schema=schema,
                             mode="DROPMALFORMED")

    splits = data_df.randomSplit([0.8, 0.2], 4)

    training = splits[0]
    test = splits[1]

    #-------------------------------------------------------------------------------------------------------------------

    tokenizer_svm = RegexTokenizer(inputCol="tweet",
                                   outputCol="words",
                                   pattern="\\s+")

    hashing_tf_svm = HashingTF(inputCol="words", outputCol="tf")

    idf_svm = IDF(inputCol="tf", outputCol="features")

    svm = LinearSVC()

    ovr = OneVsRest(classifier=svm)

    pipeline_svm = Pipeline(
        stages=[tokenizer_svm, hashing_tf_svm, idf_svm, ovr])

    model_svm = pipeline_svm.fit(training)
    result_svm = model_svm.transform(test)
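
    # Sketch (assumption, not part of the original snippet): evaluate the
    # one-vs-rest SVM on the held-out split (OneVsRest uses the default "label" column).
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator_svm = MulticlassClassificationEvaluator(labelCol="label",
                                                      predictionCol="prediction",
                                                      metricName="accuracy")
    print("SVM accuracy: %g" % evaluator_svm.evaluate(result_svm))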