# Load from Kinesis Stream
from pyspark.sql.functions import from_json
from pyspark.sql.types import ArrayType, StringType, StructType
from sparknlp.pretrained import PretrainedPipeline

rawData = spark \
    .readStream \
    .format("kinesis") \
    .option("streamName", "tech-trends-stream") \
    .option("endpointUrl", "https://kinesis.eu-west-2.amazonaws.com") \
    .load()

tweetSchema = StructType() \
    .add("text", StringType()) \
    .add("hashtags", ArrayType(StringType()))

# Extract JSON data from Kinesis message
tweets = rawData \
    .selectExpr("CAST(data AS STRING) jsonData") \
    .select(from_json("jsonData", tweetSchema).alias("tweets")) \
    .select('tweets.text')

# Load Pipeline and Transform for Sentiment
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")
sentiments = pipeline.transform(tweets)
result = sentiments.select('text', 'sentiment')

# Write to JSON in S3 (write the selected text/sentiment columns rather than the full annotation output)
query = result.writeStream \
    .format("json") \
    .option("path", "s3a://tech-trends-output/sentiments") \
    .option("checkpointLocation", "s3a://tech-trends-output/sentiments/checkpoint") \
    .start()
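# A minimal debugging sketch (not part of the original job): while developing, the S3 sink above
# can be swapped for the console sink to verify that the Kinesis payload is decoded correctly,
# blocking the driver until the query stops. The query name "debug_sentiments" is illustrative only.
debug_query = result.writeStream \
    .queryName("debug_sentiments") \
    .format("console") \
    .option("truncate", "false") \
    .option("numRows", 20) \
    .start()
debug_query.awaitTermination()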
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

udf_ascii_user_text = udf(ascii_user_text, StringType())
commentsCleaned = commentsCleaned.withColumn('text', udf_ascii_user_text(col('text')))
commentsCleaned.show()

# TODO: Reddit comments can be written in Markdown; should we convert the Markdown to plain text before analyzing?
# TODO: our pipeline is trained for the English language. Find a way to filter out comments that are not in English
#       (see the language-detection sketch after this listing).
# TODO: fix that weird encoding error https://stackoverflow.com/questions/39662384/pyspark-unicodeencodeerror-ascii-codec-cant-encode-character

# 3. Perform Analysis in Parallel
result = pipeline.transform(commentsCleaned)
result = result.select('subreddit', 'sentiment')

# UDF (user defined function) to get a sentiment summary from the full sentiment array (returns 1, 0, or -1)
def sentiment_sum(in_array):
    if in_array is not None and len(in_array) != 0:
        if len(in_array) == 1:
            # only one sentence exists, so take the result from the first sentence
            result = in_array[0]['result']
            if result == 'na':
                return None
            elif result == 'positive':
                return 1
            else:
                # assumed completion of the truncated branch: the remaining label is 'negative'
                return -1
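# Hypothetical continuation (not in the original listing): register sentiment_sum as a Spark UDF
# and reduce each row's sentiment annotation array to a single numeric score.
from pyspark.sql.types import IntegerType
sentiment_sum_udf = udf(sentiment_sum, IntegerType())
result = result.withColumn('sentiment_score', sentiment_sum_udf(col('sentiment')))

# One possible answer to the "non-English comments" TODO above; this is a sketch, not the
# project's chosen approach. Spark NLP ships a pretrained language detector (LanguageDetectorDL):
# the sketch tags each comment with a language code and keeps only the rows detected as English,
# and the filtered frame would replace commentsCleaned in the pipeline.transform(...) call above.
# The model name 'ld_wiki_tatoeba_cnn_21' is an assumption; check the Spark NLP models hub for
# the current multilingual detector.
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import LanguageDetectorDL

lang_document = DocumentAssembler().setInputCol('text').setOutputCol('document')
lang_detector = LanguageDetectorDL.pretrained('ld_wiki_tatoeba_cnn_21', 'xx') \
    .setInputCols(['document']) \
    .setOutputCol('language')

lang_pipeline = Pipeline(stages=[lang_document, lang_detector])
commentsTagged = lang_pipeline.fit(commentsCleaned).transform(commentsCleaned)

# Keep only rows whose detected language code is 'en', then drop the helper columns.
commentsEnglish = commentsTagged \
    .filter(F.expr("language[0].result = 'en'")) \
    .drop('document', 'language')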
def _clean_sent_pipeline(data_ip, input_col, import_c=True):
    print(f"\t\t\t---- Starting the pipeline built for >>> {input_col} <<< with import condition {import_c} ----")
    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType

    data = data_ip
    data = data.withColumn("_c0", data["_c0"].cast(IntegerType()))
    text_col = input_col

    # Keep the original row index (_c0) of every non-null row so the results can be re-joined later.
    non_null_index = data.filter(data[text_col].isNotNull()).select('_c0')
    text_clean = data.select(text_col).filter(F.col(text_col).isNotNull())
    print(f"\n\t1. Cleaning the input for Null: {data.count()} rows in, {data.count() - non_null_index.count()} null rows dropped")

    if import_c:
        from sparknlp.base import DocumentAssembler
    documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
    print("\n\t2. Attaching DocumentAssembler Transformer to the pipeline")

    if import_c:
        from sparknlp.annotator import Tokenizer
    tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
    print("\n\t3. Attaching Tokenizer Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import Normalizer
    normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
    print("\n\t4. Attaching Normalizer Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import LemmatizerModel
    lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
    print("\n\t5. Attaching LemmatizerModel Annotator to the pipeline")

    if import_c:
        import nltk
        nltk.download("popular")
        from nltk.corpus import stopwords
    eng_stopwords = stopwords.words('english')
    print("\n\t6. nltk stop-words found")

    if import_c:
        from sparknlp.annotator import StopWordsCleaner
    stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
    print("\n\t7. Attaching StopWordsCleaner Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import NGramGenerator
    ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
    print("\n\t8. Attaching NGramGenerator Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import PerceptronModel
    pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
    print("\n\t9. Attaching PerceptronModel Annotator to the pipeline")

    if import_c:
        from sparknlp.base import Finisher
    finisher = Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])
    print("\n\t10. Attaching Finisher Transformer to the pipeline")

    from pyspark.ml import Pipeline
    pipeline = Pipeline().setStages([documentAssembler, tokenizer, normalizer, lemmatizer,
                                     stopwords_cleaner, pos_tagger, ngrammer, finisher])
    print("\n\t\t\t ---- Pipeline Built Successfully ----")

    processed_tweets = pipeline.fit(text_clean).transform(text_clean)
    print("\n\t\t\t ---- Pipeline Fitted Successfully ----")

    from pyspark.sql.functions import concat
    processed_tweets = processed_tweets.withColumn('final', concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
    print("\n\tData Concatenation done - uni--ngrams")

    print("\n\t\t\t ---- Loading the Pre-trained Pipeline analyze_sentimentdl_use_twitter ----")
    from sparknlp.pretrained import PretrainedPipeline
    pipeline_sent = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")
    pipout_sent_results = pipeline_sent.transform(processed_tweets.withColumnRenamed(text_col, "text"))
    print("\n\t\t\t ---- Sentiments Fetched Successfully ----\n\n\n")

    from pyspark.sql.functions import monotonically_increasing_id, row_number
    from pyspark.sql.window import Window
    # Attach a temporary sequential id to both frames so the sentiment output can be aligned
    # with the original row index (_c0) of the non-null rows.
    pipout_sent_results = pipout_sent_results.withColumn("id_tmp", row_number().over(Window.orderBy(monotonically_increasing_id())))
    non_null_index = non_null_index.withColumn("id_tmp", row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("\n$$$ Indexing done for the Compiled Result")

    data_op = data.join(non_null_index.join(pipout_sent_results, on=["id_tmp"]).drop("id_tmp"), on=["_c0"], how='left_outer')
    data_op = data_op.withColumn("_c0", data_op["_c0"].cast(IntegerType()))
    print("\n$$$ Joining the final results with original dataframe")
    print(f"\nOriginal IP={data.count()} \nNonNull Index={non_null_index.count()} \nNull_Clean={text_clean.count()} \nOriginal OP={data_op.count()}")
    data.show(4)

    final_results = data_op.orderBy("_c0")
    print("\n$$$ Spark Created")

    # Pull the columns needed for the report out of Spark and into plain Python lists.
    id = list(final_results.select('str_id').toPandas().str_id)
    createdat = list(final_results.select('created_at').toPandas().created_at)
    fulltext = list(final_results.select('full_text').toPandas().full_text)
    favoritecount = list(final_results.select('favorite_count').toPandas().favorite_count)
    retweetcount = list(final_results.select('retweet_count').toPandas().retweet_count)
    pipeclean = list(final_results.select('text').toPandas().text)
    textlen = list(final_results.select('finished_unigrams').toPandas().finished_unigrams.apply(lambda row: int(len(row))))
    # Extract the predicted label from the string form of the sentiment annotation.
    sentscores = list(final_results.select('sentiment').toPandas().sentiment.apply(lambda row: str(row).split(",")[3].split("'")[1]))

    import pandas as p
    op_df = p.DataFrame(
        list(zip(id, createdat, fulltext, favoritecount, retweetcount, pipeclean, textlen, sentscores)),
        columns=['str_id', 'created_at', 'text_full', 'favorite_count', 'retweet_count',
                 'text_pipe_clean', 'text_length', 'sentiment_score'])
    print("\n$$$ Pandas Created")
    print(op_df.head(4))
    return op_df
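# Hypothetical call site (not in the original): the CSV path and column name below are assumptions,
# chosen to match the columns the function reads (_c0 as a row-index column, plus str_id,
# created_at, full_text, favorite_count, retweet_count).
tweets_df = spark.read.csv("scraped_tweets.csv", header=True, inferSchema=True)
tweet_sentiments_pd = _clean_sent_pipeline(tweets_df, "full_text")
tweet_sentiments_pd.to_csv("tweet_sentiments_scored.csv", index=False)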