Code Example #1
# Imports used in this example (PySpark + Spark NLP)
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StringType, ArrayType
from sparknlp.pretrained import PretrainedPipeline

# Load from Kinesis Stream
rawData = spark\
    .readStream\
    .format("kinesis")\
    .option("streamName", "tech-trends-stream")\
    .option("endpointUrl", "https://kinesis.eu-west-2.amazonaws.com")\
    .load()

# Expected schema of the incoming tweet JSON
tweetSchema = StructType() \
    .add("text", StringType()) \
    .add("hashtags", ArrayType(StringType()))

# Extract JSON data from Kinesis message
tweets = rawData \
    .selectExpr("cast (data as STRING) jsonData") \
    .select(from_json("jsonData", tweetSchema).alias("tweets")) \
    .select('tweets.text')

# Load Pipeline and Transform for Sentiment
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")
sentiments = pipeline.transform(tweets)

result = sentiments.select('text', 'sentiment')

# Write to JSON in S3
query = result.writeStream\
    .format("json")\
    .option("path", "s3a://tech-trends-output/sentiments")\
    .option("checkpointLocation", "s3a://tech-trends-output/sentiments/checkpoint")\
    .start()
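
# A streaming query has to be kept alive by the driver; a minimal addition
# (assumed, not in the original snippet):
query.awaitTermination()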
Code Example #2

# Imports used below
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Wrap the (externally defined) ascii_user_text helper as a UDF and apply it to the text column
udf_ascii_user_text = udf(ascii_user_text, StringType())
commentsCleaned = commentsCleaned.withColumn('text',
                                             udf_ascii_user_text(col('text')))

commentsCleaned.show()

# TODO: reddit comments can be written in markdown, should we convert the markdown to plain text before analyzing?
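
# A possible approach to the markdown TODO above (an assumption, not from the
# original): strip common markdown syntax with regexes before sentiment analysis.
import re

def strip_markdown(text):
    if text is None:
        return None
    text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)  # [label](url) -> label
    text = re.sub(r'[`*_~>#]+', '', text)                  # drop inline formatting characters
    return text

udf_strip_markdown = udf(strip_markdown, StringType())
# commentsCleaned = commentsCleaned.withColumn('text', udf_strip_markdown(col('text')))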

# TODO: our pipeline is trained for the english language. find a way to filter out comments that are not in english?
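
# One option for the language TODO above (an assumption; relies on the third-party
# langdetect package being installed on the workers):
from langdetect import detect
from pyspark.sql.types import BooleanType

def is_english(text):
    try:
        return detect(text) == 'en'
    except Exception:
        return False

udf_is_english = udf(is_english, BooleanType())
# commentsCleaned = commentsCleaned.filter(udf_is_english(col('text')))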

# TODO: fix that weird encoding error https://stackoverflow.com/questions/39662384/pyspark-unicodeencodeerror-ascii-codec-cant-encode-character
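
# A common workaround for the linked UnicodeEncodeError (a sketch only; the script's
# ascii_user_text helper, not shown here, may already do something similar):
import unicodedata

def to_ascii(text):
    if text is None:
        return None
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')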

# 3. Perform Analysis in Parallel
# 'pipeline' is the Spark NLP sentiment pipeline defined elsewhere in the script (not shown in this snippet)
result = pipeline.transform(commentsCleaned)
result = result.select('subreddit', 'sentiment')


# UDF (user defined function) to get sentiment summary from the full sentiment Array (returns 1, 0, or -1)
def sentiment_sum(in_array):
    if in_array is not None and len(in_array) != 0:
        if len(in_array) == 1:
            # only one sentence exists, so take the result from the first sentence
            result = in_array[0]['result']

            if result == 'na':
                return None
            elif result == 'positive':
                return 1
            else:
                # assumed completion of the branch truncated in the original: treat the rest as negative
                return -1
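
# Sketch of how such a summary function would typically be applied (assumed usage,
# not part of the original snippet):
from pyspark.sql.types import IntegerType

udf_sentiment_sum = udf(sentiment_sum, IntegerType())
result = result.withColumn('sentiment_score', udf_sentiment_sum(col('sentiment')))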
Code Example #3
def _clean_sent_pipeline(data_ip, input_col, import_c=True):
  print(f"\t\t\t---- Starting the pipeline built for >>> {input_col} <<< with import condition {import_c} ----")
  import sparknlp  # needed for the fully-qualified sparknlp.base / sparknlp.annotator references below
  from pyspark.sql import functions as F
  from pyspark.sql.types import IntegerType
  data = data_ip
  data = data.withColumn("_c0", data["_c0"].cast(IntegerType()))
  text_col = input_col
  non_null_index = (data.filter(data[text_col].isNotNull())).select('_c0')

  text_clean = data.select(text_col).filter(F.col(text_col).isNotNull())
  print(f"\n\t1. Cleaning the input for Null {data.count()} to {data.count()-non_null_index.count()}")

  if import_c: from sparknlp.base import DocumentAssembler
  documentAssembler = sparknlp.base.DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  print(f"\n\t2. Attaching DocumentAssembler Transformer to the pipeline")

  if import_c: from sparknlp.annotator import Tokenizer
  tokenizer = sparknlp.annotator.Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  print(f"\n\t3. Attaching Tokenizer Annotator to the pipeline")

  if import_c: from sparknlp.annotator import Normalizer
  normalizer = sparknlp.annotator.Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  print(f"\n\t4. Attaching Normalizer Annotator to the pipeline")

  if import_c: from sparknlp.annotator import LemmatizerModel
  lemmatizer = sparknlp.annotator.LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  print(f"\n\t5. Attaching LemmatizerModel Annotator to the pipeline")

  if import_c: 
    import nltk
    nltk.download("popular")
  from nltk.corpus import stopwords
  eng_stopwords = stopwords.words('english')
  print(f"\n\t6. nltk stop-words found")

  if import_c: from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = sparknlp.annotator.StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  print(f"\n\t7. Attaching StopWordsCleaner Annotator to the pipeline")

  if import_c: from sparknlp.annotator import NGramGenerator
  ngrammer = sparknlp.annotator.NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  print(f"\n\t8. Attaching NGramGenerator Annotator to the pipeline")
  

  if import_c: from sparknlp.annotator import PerceptronModel
  pos_tagger = sparknlp.annotator.PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  print(f"\n\t9. Attaching PerceptronModel Annotator to the pipeline")

  if import_c: from sparknlp.base import Finisher
  finisher = sparknlp.base.Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  print(f"\n\t10. Attaching Finisher Transformer to the pipeline")

  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  print("\n\t\t\t ---- Pipeline Built Successfully ----")

  processed_tweets = pipeline.fit(text_clean).transform(text_clean)
  print("\n\t\t\t ---- Pipeline Fitted Successfully ----")

  from pyspark.sql.functions import concat
  processed_tweets = processed_tweets.withColumn('final',concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  print("\n\tData Concatination done - uni--ngrams")

  print("\n\t\t\t ---- Loading the Pre-trained Pipeline  analyze_sentimentdl_use_twitter----")

  from sparknlp.pretrained import PretrainedPipeline
  pipeline_sent = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")

  pipout_sent_results = pipeline_sent.transform(processed_tweets.withColumnRenamed(text_col, "text"))

  print("\n\t\t\t ---- Sentiments Fetched Successfully ----\n\n\n")

  from pyspark.sql.functions import col
  from pyspark.sql.functions import monotonically_increasing_id, row_number
  from pyspark.sql.window import Window
  pipout_sent_results=pipout_sent_results.withColumn("id_tmp",row_number().over(Window.orderBy(monotonically_increasing_id())))
  non_null_index=non_null_index.withColumn("id_tmp",row_number().over(Window.orderBy(monotonically_increasing_id())))

  print("\n$$$ Indexing done for the Compiled Result")

  data_op=data.join(non_null_index.join(pipout_sent_results, on=["id_tmp"]).drop("id_tmp"), on=["_c0"], how='left_outer')
  data_op=data_op.withColumn("_c0", data_op["_c0"].cast(IntegerType()))

  print("\n$$$ Joining the final resutls with original dataframe") #f**k<<catch this

  print(f"\nOriginal IP={data.count()} \nNonNull Index={non_null_index.count()} \nNull_Clean={text_clean.count()} \nOriginal OP={data_op.count()}")
  data.show(4)
  #print("\t\t\t\t\t CONVERTED TO THIS")
  final_results = data_op.orderBy("_c0")
  print("\n$$$ Spark Created")


  import pandas as p  # pandas, matching the 'p' alias used for op_df below
  id = list((((final_results.select('str_id')).toPandas())).str_id)
  createdat = list((((final_results.select('created_at')).toPandas())).created_at)
  fulltext = list((((final_results.select('full_text')).toPandas())).full_text)
  favoritecount = list((((final_results.select('favorite_count')).toPandas())).favorite_count)
  retweetcount = list((((final_results.select('retweet_count')).toPandas())).retweet_count)
  pipeclean = list((((final_results.select('text')).toPandas())).text)
  textlen = list(((final_results.select('finished_unigrams')).toPandas()).finished_unigrams.apply(lambda row: int(len(row))))
  sentscores = list(((final_results.select('sentiment')).toPandas()).sentiment.apply(lambda row: (((str(row)).split(",")[3]).split("'")[1])))
  op_df = p.DataFrame(
      list(zip(id, createdat, fulltext, favoritecount, retweetcount, pipeclean, textlen, sentscores)),
      columns=['str_id', 'created_at', 'text_full', 'favorite_count', 'retweet_count',
               'text_pipe_clean', 'text_length', 'sentiment_score'])

  print("\n$$$ Pandas Created")
  print(op_df.head(4))

  return op_df
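
# Example call (a sketch; assumes an active SparkSession named 'spark', a placeholder
# file 'tweets.csv', and the column names the function above expects: '_c0', 'str_id',
# 'created_at', 'full_text', 'favorite_count', 'retweet_count'):
tweets_df = spark.read.csv("tweets.csv", header=True)
sentiment_summary = _clean_sent_pipeline(tweets_df, "full_text")
print(sentiment_summary.head())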