def run_nlp_pipeline(df):
    """Perform lemmatization using Spark NLP (add-on library)."""
    document_assembler = DocumentAssembler() \
        .setInputCol("words_joined") \
        .setOutputCol("document")

    # Obtain tokens from a string
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Use a spaCy lemma dictionary to train the Spark NLP lemmatizer
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(LEMMAS, key_delimiter="->", value_delimiter=r"\s+", read_as="TEXT")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setIncludeMetadata(False)

    nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])

    nlpPipelineDF = nlpPipeline.fit(df) \
        .transform(df) \
        .withColumnRenamed('finished_lemma', 'allTokens')

    return nlpPipelineDF
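# A minimal usage sketch for run_nlp_pipeline (an assumption, not part of the original
# source): it needs a Spark NLP session, a DataFrame with a "words_joined" string column,
# and a LEMMAS path pointing to a lemma dictionary whose entries use "->" as the key
# delimiter and whitespace between values, as the delimiters above imply. The file name
# below is a placeholder.
import sparknlp
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Lemmatizer
from pyspark.ml import Pipeline

spark = sparknlp.start()
LEMMAS = "lemmas_from_spacy.txt"  # hypothetical dictionary path
sample = spark.createDataFrame([("the cats were running home",)], ["words_joined"])
run_nlp_pipeline(sample).select("allTokens").show(truncate=False)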
def sparknlp_transform(df):
    documentAssembler = DocumentAssembler() \
        .setInputCol('review') \
        .setOutputCol('document')

    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    normalizer = Normalizer() \
        .setInputCols(['token']) \
        .setOutputCol('normalized') \
        .setLowercase(True)

    lemmatizer = LemmatizerModel.pretrained() \
        .setInputCols(['normalized']) \
        .setOutputCol('lemma')

    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols(['lemma']) \
        .setOutputCol('clean_token') \
        .setCaseSensitive(False) \
        .setStopWords(eng_stopwords)

    # The finisher converts the annotated tokens back into human-readable output columns
    finisher = Finisher() \
        .setInputCols(['clean_token']) \
        .setCleanAnnotations(True)

    pipeline = Pipeline() \
        .setStages([
            documentAssembler,
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            finisher
        ])

    data = pipeline.fit(df).transform(df)
    return data
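# A hedged usage sketch for sparknlp_transform (not from the original source): it expects a
# DataFrame with a 'review' string column and a global eng_stopwords list, here taken from
# Spark's built-in English stop words. Assumes the Spark NLP session from the earlier sketch.
from sparknlp.annotator import Normalizer, LemmatizerModel, StopWordsCleaner
from pyspark.ml.feature import StopWordsRemover

eng_stopwords = StopWordsRemover.loadDefaultStopWords("english")
reviews = spark.createDataFrame(
    [("The food was great and the staff were friendly.",)], ["review"])
sparknlp_transform(reviews).select("finished_clean_token").show(truncate=False)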
def LDA_pipefit(data_ip, ipcol):
    from sparknlp.base import DocumentAssembler, Finisher
    from sparknlp.annotator import (Tokenizer, Normalizer, LemmatizerModel,
                                    StopWordsCleaner, NGramGenerator, PerceptronModel)
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import CountVectorizer, IDF
    from pyspark.ml.clustering import LDA
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    text_col = ipcol
    documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
    tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
    normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
    stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
    ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
    pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
    finisher = Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])

    pipeline = Pipeline().setStages([documentAssembler, tokenizer, normalizer, lemmatizer,
                                     stopwords_cleaner, pos_tagger, ngrammer, finisher])
    processed_tweets = pipeline.fit(data_ip).transform(data_ip)

    # Combine unigrams and n-grams into a single token column
    processed_tweets = processed_tweets.withColumn(
        'final', F.concat(F.col('finished_unigrams'), F.col('finished_ngrams')))

    # Term frequency
    tfizer = CountVectorizer(inputCol='final', outputCol='tf_features')
    tf_model = tfizer.fit(processed_tweets)
    tf_result = tf_model.transform(processed_tweets)

    # Inverse document frequency
    idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
    idf_model = idfizer.fit(tf_result)
    tfidf_result = idf_model.transform(tf_result)

    # Fit the LDA topic model on the TF-IDF features
    num_topics = 3
    max_iter = 10
    lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
    lda_model = lda.fit(tfidf_result)

    # Map the term indices of each topic back to vocabulary words
    vocab = tf_model.vocabulary

    def get_words(token_list):
        return [vocab[token_id] for token_id in token_list]

    udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

    num_top_words = 15
    topics = lda_model.describeTopics(num_top_words) \
        .withColumn('topicWords', udf_to_words(F.col('termIndices')))
    topics_p = topics.toPandas()
    return topics_p
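# A hedged usage sketch for LDA_pipefit (not from the original source): it assumes the Spark
# NLP session and eng_stopwords from the earlier sketches, and a DataFrame of short texts in
# the column passed as ipcol. The sample rows are placeholders.
tweets = spark.createDataFrame(
    [("the pizza was amazing and the service was fast",),
     ("terrible traffic and rude drivers this morning",),
     ("loving the new coffee shop downtown",)],
    ["text"])
topics_df = LDA_pipefit(tweets, "text")
print(topics_df[["topic", "topicWords"]])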
def build_data(df):
    document_assembler1 = DocumentAssembler() \
        .setInputCol('question1').setOutputCol('document1')
    tokenizer1 = Tokenizer() \
        .setInputCols(['document1']) \
        .setOutputCol('token1')
    finisher1 = Finisher() \
        .setInputCols(['token1']) \
        .setOutputCols(['ntokens1']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    document_assembler2 = DocumentAssembler() \
        .setInputCol('question2').setOutputCol('document2')
    tokenizer2 = Tokenizer() \
        .setInputCols(['document2']) \
        .setOutputCol('token2')
    finisher2 = Finisher() \
        .setInputCols(['token2']) \
        .setOutputCols(['ntokens2']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    p_pipeline = Pipeline(stages=[document_assembler1, tokenizer1, finisher1,
                                  document_assembler2, tokenizer2, finisher2])
    p_model = p_pipeline.fit(df)
    processed1 = p_model.transform(df)

    label1 = processed1.select('is_duplicate').collect()
    label_array1 = np.array(label1)
    label_array1 = label_array1.astype(np.int64)  # np.int is deprecated; use an explicit dtype
    return processed1, label_array1
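# A hedged usage sketch for build_data (an assumption, not from the original): it expects
# question-pair data with 'question1', 'question2' and an integer-like 'is_duplicate' label,
# e.g. in the Quora question pairs layout. The rows below are placeholders.
import numpy as np

pairs = spark.createDataFrame(
    [("How do I learn Spark?", "How can I learn Spark quickly?", 1),
     ("What is NLP?", "Where is Paris?", 0)],
    ["question1", "question2", "is_duplicate"])
processed, labels = build_data(pairs)
processed.select("ntokens1", "ntokens2").show(truncate=False)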
def setup_sentiment_pipeline():
    lexicon = 'lexicon.txt'

    document_assembler = DocumentAssembler() \
        .setInputCol("rawDocument") \
        .setOutputCol("document") \
        .setIdCol("sentence_id")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary("txt/corpus/lemmas_small.txt",
                       key_delimiter="->", value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary("txt/corpus/{0}".format(lexicon), ",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])

    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer, lemmatizer,
        sentiment_detector, finisher
    ])
    return pipeline
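# A hedged usage sketch for setup_sentiment_pipeline (not from the original source): the
# pipeline expects a DataFrame with "rawDocument" and "sentence_id" columns, plus the lemma
# and lexicon dictionary files referenced above. Sample text and ids are placeholders;
# assumes the Spark NLP session from the earlier sketch.
from sparknlp.annotator import SentenceDetector, SentimentDetector

docs = spark.createDataFrame(
    [(1, "I really enjoyed this. The ending was disappointing.")],
    ["sentence_id", "rawDocument"])
sentiment_pipeline = setup_sentiment_pipeline()
scored = sentiment_pipeline.fit(docs).transform(docs)
scored.select("sentence_id", "sentiment").show(truncate=False)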
def spark_nlp_sentiment_analysis(self):
    """
    Transform reviews with tokenization, normalization, lemmatization and a sentiment
    dictionary, then calculate a sentiment score and aggregate it per business ID.
    """
    lemma_file = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["TEXT_CORPUS_FOLDER"],
        self.s3_config["LEMMA_FILE"])
    sentiment_file = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["TEXT_CORPUS_FOLDER"],
        self.s3_config["SENTIMENT_FILE"])
    yelp_rating_filename = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["YELP_FOLDER"],
        self.s3_config["YELP_REVIEW_DATA_FILE"])

    self.df_yelp_review = self.spark.read.json(yelp_rating_filename)
    self.df_yelp_review = self.df_yelp_review \
        .select("user_id", "business_id", "stars", "text") \
        .withColumnRenamed("stars", "ratings")

    # Keep only reviews for businesses that appear in the ranking table
    self.df_id_filter = self.df_ranking.select("business_id")
    self.df_yelp_review = self.df_yelp_review \
        .join(self.df_id_filter,
              self.df_yelp_review.business_id == self.df_id_filter.business_id,
              'inner') \
        .drop(self.df_id_filter.business_id)

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normal")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(lemma_file, key_delimiter="->", value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary(sentiment_file, delimiter=",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        lemmatizer,
        sentiment_detector,
        finisher
    ])

    self.df_sentiment = pipeline \
        .fit(self.df_yelp_review) \
        .transform(self.df_yelp_review)
    self.df_sentiment.cache()

    # Map the textual sentiment label to a numeric score: +1 positive, -1 negative, 0 otherwise
    self.df_sentiment = self.df_sentiment \
        .select(self.df_sentiment.business_id,
                functions.when(self.df_sentiment.sentiment == "positive", 1)
                         .when(self.df_sentiment.sentiment == "negative", -1)
                         .otherwise(0)
                         .alias("sentiment"))

    self.df_sentiment = self.df_sentiment \
        .groupby("business_id") \
        .agg({"sentiment": "mean"}) \
        .withColumnRenamed("avg(sentiment)", "avg_sentiment_score")
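# A hedged sketch of the configuration this method reads (the keys are taken from the code
# above; the bucket, folder and file names below are placeholders, not from the original
# source). The method also assumes self.spark (a SparkSession with the S3A connector) and
# self.df_ranking (a DataFrame with a business_id column) are set elsewhere on the class.
s3_config = {
    "BUCKET": "my-yelp-bucket",
    "TEXT_CORPUS_FOLDER": "corpus",
    "LEMMA_FILE": "lemmas_small.txt",
    "SENTIMENT_FILE": "default-sentiment-dict.txt",
    "YELP_FOLDER": "yelp",
    "YELP_REVIEW_DATA_FILE": "review.json",
}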
# While slightly more involved than this, this is effectively taking a string and splitting
# it along the spaces, so each word is its own string. The data then becomes the
# spark-nlp native type "Token".
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# The Normalizer cleans up the tokens, stripping punctuation and other unwanted characters
# so that different surface forms of the same word line up.
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")

# The Stemmer takes objects of class "Token" and reduces the words to their
# root form. For instance, the words "cars", "cars'" and "car's" would all be replaced
# with the word "car".
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")

# The Finisher converts spark-nlp annotations back into plain Spark columns, which lets us
# access the data outside of spark-nlp components. For instance, we can now feed the data
# into components from Spark MLlib.
finisher = Finisher().setInputCols(["stem"]).setOutputCols(
    ["to_spark"]).setValueSplitSymbol(" ")

# Stopwords are common words that generally don't add much detail to the meaning
# of a body of text. In English, these are mostly function words such as "the",
# "of" and "and".
stopword_remover = StopWordsRemover(inputCol="to_spark", outputCol="filtered")

# Here we implement TF-IDF as an input to our LDA model. CountVectorizer (TF) keeps track
# of the vocabulary that's being created so we can map our topics back to their
# corresponding words.
# TF (term frequency) creates a matrix that counts how many times each word in the
# vocabulary appears in each body of text. This then gives each word a weight based
# on its frequency.
tf = CountVectorizer(inputCol="filtered", outputCol="raw_features")

# Here we implement the IDF portion. IDF (inverse document frequency) reduces
# the weight of words that appear across many documents, so very common words
# don't dominate the topics.
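# A hedged sketch of how the IDF and LDA stages might continue from here (the original
# snippet is cut off); the column names follow the ones used above, while the topic count
# and iteration budget are arbitrary placeholders.
from pyspark.ml.feature import IDF
from pyspark.ml.clustering import LDA

idf = IDF(inputCol="raw_features", outputCol="features")
lda = LDA(k=10, maxIter=10, featuresCol="features")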
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("/tmp/lemmas_small.txt", key_delimiter="->", value_delimiter="\t")

sentiment_detector = SentimentDetector() \
    .setInputCols(["lemma", "sentence"]) \
    .setOutputCol("sentiment_score") \
    .setDictionary("/tmp/default-sentiment-dict.txt", ",")

finisher = Finisher() \
    .setInputCols(["sentiment_score"]) \
    .setOutputCols(["sentiment"])

# COMMAND ----------

# MAGIC %md #### 4. Train the pipeline. It is trained only from the external resources (the lemma and sentiment dictionaries), not from the dataset we pass in; the prediction then runs on the target dataset

# COMMAND ----------

pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer,
                            lemmatizer, sentiment_detector, finisher])
model = pipeline.fit(data)
result = model.transform(data)

# COMMAND ----------

# MAGIC %md #### 5. Filter the finisher output to find the lines with positive sentiment
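# A hedged sketch of the filtering step the heading above announces (not the original
# notebook cell): with the Finisher defaults, "sentiment" comes back as an array of strings,
# so array_contains can keep the rows labelled positive.
from pyspark.sql import functions as F

positive = result.where(F.array_contains(F.col("sentiment"), "positive"))
positive.show(5, truncate=False)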
    .setInputCols(['lemmatized']) \
    .setOutputCol('cleaned_lemmatized') \
    .setStopWords(stopwords)

ngrammer = NGramGenerator() \
    .setInputCols(['lemmatized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

pos_tagger = PerceptronModel.pretrained('pos_anc') \
    .setInputCols(['document', 'lemmatized']) \
    .setOutputCol('pos')

# The Finisher inputs must match the output columns produced above
finisher = Finisher() \
    .setInputCols(['cleaned_lemmatized', 'ngrams', 'pos'])

pipeline = Pipeline() \
    .setStages([document_assembler, tokenizer, normalizer, lemmatizer,
                stopwords_cleaner, pos_tagger, ngrammer, finisher])

processed_text = pipeline.fit(text_data).transform(text_data)
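# A minimal downstream sketch (an assumption, not from the original): the Finisher exposes
# finished_* array columns that Spark MLlib can consume directly, e.g. a CountVectorizer
# over the cleaned tokens.
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol='finished_cleaned_lemmatized', outputCol='tf_features')
tf_features = cv.fit(processed_text).transform(processed_text)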