def sparknlp_transform(df):
    documentAssembler = DocumentAssembler() \
        .setInputCol('review') \
        .setOutputCol('document')
    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')
    normalizer = Normalizer() \
        .setInputCols(['token']) \
        .setOutputCol('normalized') \
        .setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained() \
        .setInputCols(['normalized']) \
        .setOutputCol('lemma')
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols(['lemma']) \
        .setOutputCol('clean_token') \
        .setCaseSensitive(False) \
        .setStopWords(eng_stopwords)
    # finisher converts tokens to human-readable output
    finisher = Finisher() \
        .setInputCols(['clean_token']) \
        .setCleanAnnotations(True)
    pipeline = Pipeline() \
        .setStages([
            documentAssembler,
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            finisher
        ])
    data = pipeline.fit(df).transform(df)
    return data
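# A minimal usage sketch for sparknlp_transform (not part of the original code).
# It assumes a Spark session started through sparknlp.start(), a DataFrame with a
# 'review' string column, and an eng_stopwords list defined at module level; here
# the stopwords come from NLTK purely for illustration (requires the NLTK
# stopwords corpus to be downloaded).
import sparknlp
from nltk.corpus import stopwords

spark = sparknlp.start()
eng_stopwords = stopwords.words('english')
reviews = spark.createDataFrame(
    [("The food was great and the staff were friendly.",)], ['review'])
cleaned = sparknlp_transform(reviews)
# Finisher prefixes its output columns with "finished_"
cleaned.select('finished_clean_token').show(truncate=False)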
def run_nlp_pipeline(df):
    """Perform lemmatization using Spark-NLP (add-on library)."""
    document_assembler = DocumentAssembler() \
        .setInputCol("words_joined") \
        .setOutputCol("document")

    # Obtain tokens from a string
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    # Use spaCy lemma dictionary to train Spark NLP lemmatizer
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(LEMMAS, key_delimiter="->", value_delimiter=r"\s+", read_as="TEXT")

    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setIncludeMetadata(False)

    nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    nlpPipelineDF = nlpPipeline.fit(df) \
        .transform(df) \
        .withColumnRenamed('finished_lemma', 'allTokens')
    return nlpPipelineDF
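# For reference, a hedged sketch (not in the original code) of the line format
# the setDictionary call above would expect given these delimiters: the key on
# the left of "->" is treated as the lemma, and each whitespace-separated value
# on the right is a surface form mapped back to it. The actual contents of the
# spaCy-derived LEMMAS file are not shown in the source.
#
#   be -> am are is was were been being
#   run -> runs ran running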
def LDA_pipefit(data_ip, ipcol):
    from sparknlp.base import DocumentAssembler, Finisher
    from sparknlp.annotator import (Tokenizer, Normalizer, LemmatizerModel,
                                    StopWordsCleaner, NGramGenerator, PerceptronModel)
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import CountVectorizer, IDF
    from pyspark.ml.clustering import LDA
    from pyspark.sql import functions as F, types as T

    text_col = ipcol
    documentAssembler = DocumentAssembler() \
        .setInputCol(text_col) \
        .setOutputCol('document')
    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('tokenized')
    normalizer = Normalizer() \
        .setInputCols(['tokenized']) \
        .setOutputCol('normalized') \
        .setLowercase(True)
    lemmatizer = LemmatizerModel.pretrained() \
        .setInputCols(['normalized']) \
        .setOutputCol('lemmatized')
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols(['lemmatized']) \
        .setOutputCol('unigrams') \
        .setStopWords(eng_stopwords)
    ngrammer = NGramGenerator() \
        .setInputCols(['lemmatized']) \
        .setOutputCol('ngrams') \
        .setN(3) \
        .setEnableCumulative(True) \
        .setDelimiter('_')
    pos_tagger = PerceptronModel.pretrained('pos_anc') \
        .setInputCols(['document', 'lemmatized']) \
        .setOutputCol('pos')
    finisher = Finisher() \
        .setInputCols(['unigrams', 'ngrams', 'pos'])
    pipeline = Pipeline().setStages([documentAssembler, tokenizer, normalizer, lemmatizer,
                                     stopwords_cleaner, pos_tagger, ngrammer, finisher])
    processed_tweets = pipeline.fit(data_ip).transform(data_ip)

    # Combine unigrams and n-grams into a single token column
    processed_tweets = processed_tweets.withColumn(
        'final', F.concat(F.col('finished_unigrams'), F.col('finished_ngrams')))

    # Term frequency followed by inverse document frequency
    tfizer = CountVectorizer(inputCol='final', outputCol='tf_features')
    tf_model = tfizer.fit(processed_tweets)
    tf_result = tf_model.transform(processed_tweets)
    idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
    idf_model = idfizer.fit(tf_result)
    tfidf_result = idf_model.transform(tf_result)

    # Fit LDA and map the top term indices of each topic back to vocabulary words
    num_topics = 3
    max_iter = 10
    lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
    lda_model = lda.fit(tfidf_result)

    vocab = tf_model.vocabulary

    def get_words(token_list):
        return [vocab[token_id] for token_id in token_list]

    udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

    num_top_words = 15
    topics = lda_model.describeTopics(num_top_words) \
        .withColumn('topicWords', udf_to_words(F.col('termIndices')))
    topics_p = topics.toPandas()
    return topics_p
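# A minimal usage sketch for LDA_pipefit (not part of the original code). It
# assumes an active Spark NLP session, an eng_stopwords list at module level,
# and a DataFrame with a free-text column; the column name 'tweet_text' and the
# sample rows are purely illustrative. The pretrained lemmatizer and 'pos_anc'
# tagger are downloaded on first use.
import sparknlp
from nltk.corpus import stopwords

spark = sparknlp.start()
eng_stopwords = stopwords.words('english')
tweets = spark.createDataFrame(
    [("the new phone battery lasts all day",),
     ("terrible battery life on this phone",),
     ("great camera and battery",)],
    ['tweet_text'])
topics_df = LDA_pipefit(tweets, 'tweet_text')  # pandas DataFrame of topics
print(topics_df[['topic', 'topicWords']])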
def build_data(df):
    document_assembler1 = DocumentAssembler() \
        .setInputCol('question1') \
        .setOutputCol('document1')
    tokenizer1 = Tokenizer() \
        .setInputCols(['document1']) \
        .setOutputCol('token1')
    finisher1 = Finisher() \
        .setInputCols(['token1']) \
        .setOutputCols(['ntokens1']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    document_assembler2 = DocumentAssembler() \
        .setInputCol('question2') \
        .setOutputCol('document2')
    tokenizer2 = Tokenizer() \
        .setInputCols(['document2']) \
        .setOutputCol('token2')
    finisher2 = Finisher() \
        .setInputCols(['token2']) \
        .setOutputCols(['ntokens2']) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)

    p_pipeline = Pipeline(stages=[document_assembler1, tokenizer1, finisher1,
                                  document_assembler2, tokenizer2, finisher2])
    p_model = p_pipeline.fit(df)
    processed1 = p_model.transform(df)

    # Collect the labels into a NumPy integer array
    # (np.int was removed in NumPy 1.24; use the built-in int instead)
    label1 = processed1.select('is_duplicate').collect()
    label_array1 = np.array(label1)
    label_array1 = label_array1.astype(int)
    return processed1, label_array1
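# A hedged usage sketch for build_data (illustrative only): it assumes a Spark
# NLP session and a DataFrame with 'question1', 'question2' and 'is_duplicate'
# columns, as in the Quora question-pairs dataset. The sample rows are made up.
import numpy as np
import sparknlp

spark = sparknlp.start()
pairs = spark.createDataFrame(
    [("How do I learn Spark?", "How can I learn Spark quickly?", 1),
     ("What is NLP?", "Where is the Eiffel Tower?", 0)],
    ['question1', 'question2', 'is_duplicate'])
processed, labels = build_data(pairs)
processed.select('ntokens1', 'ntokens2').show(truncate=False)
print(labels.ravel())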
def tokenize(self, event):
    print('entered tokenizer')
    documentAssembler = DocumentAssembler()
    documentAssembler.setInputCol('text')
    documentAssembler.setOutputCol('document')

    self.spark_df = self.spark.createDataFrame(self.df.astype(str))
    self.spark_df = documentAssembler.transform(self.spark_df)

    tokenizer = Tokenizer()
    tokenizer.setInputCols(['document'])
    tokenizer.setOutputCol('token')
    tokenizer.setTargetPattern(self.search_pattern_input.value)

    token_df = tokenizer.fit(self.spark_df)
    current_df = token_df.transform(self.spark_df)
    self.spark_df = current_df

    self.display_df = get_all_lines(self.spark_df, 'token.result', col='token')
    self.continue_button.disabled = False
class ResourceDownloader(object):

    factory = {
        DocumentAssembler.reader: lambda: DocumentAssembler(),
        SentenceDetector.reader: lambda: SentenceDetector(),
        Tokenizer.reader: lambda: Tokenizer(),
        PerceptronModel.reader: lambda: PerceptronModel(),
        NerCrfModel.reader: lambda: NerCrfModel()
    }

    def downloadModel(self, reader, name, language):
        j_obj = _internal._DownloadModel(reader, name, language).apply()
        py_obj = self.factory[reader]()
        py_obj._java_obj = j_obj
        return py_obj

    def downloadPipeline(self, name, language):
        j_obj = _internal._DownloadPipeline(name, language).apply()
        jmodel = JavaModel()
        jmodel._java_obj = j_obj
        return jmodel
def setup_sentiment_pipeline():
    lexicon = 'lexicon.txt'
    document_assembler = DocumentAssembler() \
        .setInputCol("rawDocument") \
        .setOutputCol("document") \
        .setIdCol("sentence_id")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary("txt/corpus/lemmas_small.txt",
                       key_delimiter="->", value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary("txt/corpus/{0}".format(lexicon), ",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])
    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer,
        lemmatizer, sentiment_detector, finisher
    ])
    return pipeline
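# A hedged usage sketch for setup_sentiment_pipeline (not in the original):
# it assumes the lemma and sentiment dictionary files exist at the relative
# paths above, and a DataFrame with 'sentence_id' and 'rawDocument' columns.
# The sample rows are illustrative only.
import sparknlp

spark = sparknlp.start()
docs = spark.createDataFrame(
    [(1, "the service was wonderful"), (2, "the food was awful")],
    ['sentence_id', 'rawDocument'])
sentiment_pipeline = setup_sentiment_pipeline()
scored = sentiment_pipeline.fit(docs).transform(docs)
scored.select('sentence_id', 'sentiment').show(truncate=False)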
def simplePipeline():
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentences")
    tokenizer = Tokenizer() \
        .setInputCols(["sentences"]) \
        .setOutputCol("token")
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normal")
    word_embeddings = WordEmbeddingsModel.pretrained() \
        .setInputCols(["document", "normal"]) \
        .setOutputCol("embeddings")
    nlpPipeline = Pipeline(stages=[
        document_assembler,
        sentenceDetector,
        tokenizer,
        normalizer,
        word_embeddings,
    ])
    return nlpPipeline
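# A minimal usage sketch for simplePipeline (illustrative only): it assumes a
# DataFrame with a 'text' column and downloads the default pretrained
# WordEmbeddingsModel (typically glove_100d) on first use.
import sparknlp

spark = sparknlp.start()
sample = spark.createDataFrame([("Spark NLP makes embeddings easy.",)], ['text'])
embedded = simplePipeline().fit(sample).transform(sample)
# Each annotation in the 'embeddings' column carries the token and its vector
embedded.selectExpr("explode(embeddings) as e") \
    .selectExpr("e.result as token", "e.embeddings as vector") \
    .show(5, truncate=80)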
def spark_nlp_sentiment_analysis(self):
    """
    Transform reviews with tokenization, normalization, lemmatization and a
    sentiment dictionary, then calculate a sentiment score and aggregate it
    by business ID.
    """
    lemma_file = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["TEXT_CORPUS_FOLDER"],
        self.s3_config["LEMMA_FILE"])
    sentiment_file = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["TEXT_CORPUS_FOLDER"],
        self.s3_config["SENTIMENT_FILE"])
    yelp_rating_filename = "s3a://{}/{}/{}".format(
        self.s3_config["BUCKET"],
        self.s3_config["YELP_FOLDER"],
        self.s3_config["YELP_REVIEW_DATA_FILE"])

    self.df_yelp_review = self.spark.read.json(yelp_rating_filename)
    self.df_yelp_review = self.df_yelp_review \
        .select("user_id", "business_id", "stars", "text") \
        .withColumnRenamed("stars", "ratings")

    # Keep only reviews for businesses present in the ranking table
    self.df_id_filter = self.df_ranking.select("business_id")
    self.df_yelp_review = self.df_yelp_review \
        .join(self.df_id_filter,
              self.df_yelp_review.business_id == self.df_id_filter.business_id,
              'inner') \
        .drop(self.df_id_filter.business_id)

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normal")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary(lemma_file, key_delimiter="->", value_delimiter="\t")
    sentiment_detector = SentimentDetector() \
        .setInputCols(["lemma", "sentence"]) \
        .setOutputCol("sentiment_score") \
        .setDictionary(sentiment_file, delimiter=",")
    finisher = Finisher() \
        .setInputCols(["sentiment_score"]) \
        .setOutputCols(["sentiment"])

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        lemmatizer,
        sentiment_detector,
        finisher
    ])

    self.df_sentiment = pipeline \
        .fit(self.df_yelp_review) \
        .transform(self.df_yelp_review)
    self.df_sentiment.cache()

    # Map the textual sentiment label to a numeric score: +1 / -1 / 0
    self.df_sentiment = self.df_sentiment \
        .select(self.df_sentiment.business_id,
                functions.when(self.df_sentiment.sentiment == "positive", 1)
                         .when(self.df_sentiment.sentiment == "negative", -1)
                         .otherwise(0)
                         .alias("sentiment"))

    self.df_sentiment = self.df_sentiment \
        .groupby("business_id") \
        .agg({"sentiment": "mean"}) \
        .withColumnRenamed("avg(sentiment)", "avg_sentiment_score")
        # Change the name of the new column
        ).alias("text")
    )
)

# Now we begin assembling our pipeline. Each component here applies some
# transformation to the data.

# The DocumentAssembler takes the raw text data and converts it into a format
# that can be tokenized. It becomes one of spark-nlp's native object types,
# the "Document".
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

# The Tokenizer takes data of the "Document" type and tokenizes it. While
# slightly more involved than this, it is effectively taking a string and
# splitting it along the spaces, so each word is its own string. The data then
# becomes the spark-nlp native type "Token".
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# The Normalizer cleans up each token, stripping out unwanted characters and
# punctuation so downstream stages see a consistent form of each word.
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")

# The Stemmer takes objects of class "Token" and reduces the words to their
# root form. For instance, the words "cars", "cars'" and "car's" would all be
# replaced with the word "car".
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")

# The Finisher converts the annotations back into plain Spark columns, so we
# can access the data outside of spark-nlp components. For instance, we can
# now feed the data into components from Spark MLlib.
finisher = (
    Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["to_spark"])
    .setValueSplitSymbol(" ")
)

# Stopwords are common words that generally don't add much detail to the meaning
# of a text.
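# A hedged sketch of how this snippet likely continues (not part of the original
# excerpt): since the Finisher has already produced a plain Spark array column,
# Spark MLlib's StopWordsRemover can drop the stopwords from "to_spark".
from pyspark.ml.feature import StopWordsRemover

stopword_remover = StopWordsRemover(inputCol="to_spark", outputCol="filtered")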
LANG = "english" spark = sparknlp.start() path = 'Some path' data = spark.read.csv(path, header=True) text_col = 'sentences' text_data = data.select(text_col).filter(F.col(text_col).isNotNull()) document_assembler = DocumentAssembler() \ .setInputCol(text_col) \ .setOutputCol("document") tokenizer = Tokenizer() \ .setInputCols(['document']) \ .setOutputCol('tokens') normalizer = Normalizer() \ .setInputCols(['tokens']) \ .setOutputCol('normalized') \ .setLowercase(True) lemmatizer = LemmatizerModel.pretrained() \ .setInputCols(['normalized']) \ .setOutputCol('lemmatized') stopwords = stopwords.words(LANG) stopwords_cleaner = StopWordsCleaner() \ .setInputCols(['lemmatized']) \