def test_API():
    from sparknlp.pretrained import PretrainedPipeline

    # Clinical pipeline from the licensed 'clinical/models' repo; requires a
    # Spark NLP for Healthcare license
    ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")
    result = ner_pipeline.annotate("medication")
    print(result)
    return result
def main():
    spark, sc = init_spark()

    # Download a pre-trained pipeline
    pipeline = PretrainedPipeline('explain_document_dl', lang='en')

    # Your testing dataset
    text = """
    The Mona Lisa is a 16th century oil painting created by Leonardo.
    It's held at the Louvre in Paris.
    """

    # Annotate your testing dataset
    result = pipeline.annotate(text)

    # What's in the pipeline
    print(list(result.keys()))
    print(result['entities'])
def construct_component_from_pipe_identifier(language, sparknlp_reference):
    '''
    Creates a list of components from a Spark NLP Pipeline reference:
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param language: language of the pipeline
    :param sparknlp_reference: Reference to a Spark NLP pretrained pipeline
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in sparknlp_reference:
        language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(sparknlp_reference, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        if 'NerConverter' in component.name:
            constructed_components.append(Util(component_name='ner_converter', model=component))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            # todo differentiate normal and deep detector
            constructed_components.append(nlu.Util(component_name='sentence_detector', model=component))
        elif parsed == 'regex':
            constructed_components.append(nlu.Matcher(component_name='regex', model=component))
        elif parsed == 'text':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            constructed_components.append(nlu.lemmatizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif parsed == 'pos' or parsed == 'language':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(component_name='ner', model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'multi':
            constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed in ['universal', 'bert', 'albert', 'elmo', 'xlnet', 'glove', 'electra', 'covidbert', 'small_bert', '']:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'vivekn':
            constructed_components.append(nlu.Classifier(component_name='vivekn', model=component))
        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif '2e2' in parsed:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'embeddings_chunk':
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        logger.info("Extracted into NLU Component type : %s", parsed)
    if None in constructed_components:
        logger.exception(
            "EXCEPTION: Could not infer component type for lang=%s and sparknlp_reference=%s during pipeline conversion",
            language, sparknlp_reference)
        return None
    return constructed_components
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

# Download a pre-trained pipeline
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Your testing dataset
text = """
The Mona Lisa is a 16th century oil painting created by Leonardo.
It's held at the Louvre in Paris.
"""

# Annotate your testing dataset
result = pipeline.annotate(text)

# What's in the pipeline
print(list(result.keys()))
print(result['entities'])
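# Follow-up sketch (an addition, not part of the original snippet):
# fullAnnotate() returns Annotation objects that keep the character offsets
# and metadata which annotate() discards. Reuses the `pipeline` and `text`
# defined above.
full_result = pipeline.fullAnnotate(text)[0]
for entity in full_result['entities']:
    # each Annotation carries begin/end offsets plus the NER metadata
    print(entity.result, entity.begin, entity.end, entity.metadata)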
from pyspark.sql import functions as F
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# SECRET is the Spark NLP for Healthcare license key, assumed to be defined earlier
spark = sparknlp_jsl.start(SECRET, params=params)

from sparknlp.pretrained import PretrainedPipeline

ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")
result = ner_pipeline.annotate("medication")
print(100 * '-')
print(result)
print(100 * '-')
## LEMMATIZATION
import sparknlp
sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower, col
from pyspark.ml.feature import StringIndexer

# s_df is assumed to be a DataFrame with a 'review_text' column
pipeline = PretrainedPipeline('explain_document_ml', 'en')
s_df2 = pipeline.annotate(s_df, "review_text")
s_df2 = s_df2.drop(
    *["document", "sentence", "token", "spell", "stems", "pos", "text"])


def mkString(line):
    return " ".join([str(x[3]) for x in line])


string_udf = udf(lambda z: mkString(z), StringType())
s_df2 = s_df2.withColumn("lemmatizedText", string_udf("lemmas"))
s_df2 = s_df2.withColumn("lemmatizedText", lower(col("lemmatizedText")))

# define 4 processing steps and execute them with a transformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline

# 1. Tokenizer; .setPattern("\\p{L}+") means it removes accents from words
#    (check it has no impact on the smileys!)
tokenizer = RegexTokenizer().setGaps(False)\
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref):
    '''
    Creates a list of components from a Spark NLP Pipeline reference:
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in nlp_ref:
        language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(nlp_ref, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        if c_name == 'NerConverter':
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif c_name == 'TokenizerModel' and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif c_name == 'TokenizerModel':
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            constructed_components.append(nlu.Util(annotator_class='sentence_detector', model=component))
        elif parsed == 'regex':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'date':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'text':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif c_name == 'PerceptronModel':
            constructed_components.append(nlu.Classifier(annotator_class='classifierdl', model=component))
        elif c_name == 'ClassifierDLModel':
            constructed_components.append(nlu.Classifier(annotator_class='language_detector', model=component))
        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            constructed_components.append(nlu.UnlabledDepParser(model=component))
        elif parsed == 'multi':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'embeddings_chunk':
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        else:
            logger.exception(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion",
                language, nlp_ref)
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        logger.info("Extracted into NLU Component type : %s", parsed)
    if None in constructed_components:
        logger.exception(
            "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion",
            language, nlp_ref)
        return None
    return constructed_components
df.cache().count()

# COMMAND ----------

display(df.select("text"))

# COMMAND ----------

# MAGIC %md #### Extraction

# COMMAND ----------

import sparknlp
from sparknlp.pretrained import PretrainedPipeline

pipeline = PretrainedPipeline('recognize_entities_dl_noncontrib', 'en')
result = pipeline.annotate(df, column='text')

# COMMAND ----------

result.cache().count()

# COMMAND ----------

from pyspark.sql.functions import explode, col
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

exploded = result.select(explode(col('entities.result')).alias("entities"))
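# COMMAND ----------

# Follow-up sketch (an addition, not in the original notebook): render the
# extracted entities as a word cloud; collect() is only safe for small results.
entity_text = ' '.join(row['entities'] for row in exploded.collect())
wc = WordCloud(stopwords=STOPWORDS, background_color='white').generate(entity_text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
display(plt.gcf())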
def load_pipeline(name):
    # if name == 'match_datetime':
    #     return light_Datetime
    # else:
    return PretrainedPipeline(name, lang='en')
def construct_component_from_pipe_identifier(
        language, nlp_ref, nlu_ref, path=None, is_licensed=False):  # -> NLUPipeline
    """
    creates a list of components from a Spark NLP Pipeline reference
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param is_licensed: Whether the pipe is licensed or not
    :param nlu_ref: NLU ref that points to this pipe
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param path: Load component_list from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    """
    if 'language' in nlp_ref:
        # special edge case for lang detectors
        language = 'xx'
    if path is None:
        if is_licensed:
            pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
        else:
            pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []
    os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict()
    hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()
    ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()
    for jsl_anno_object in iterable_stages:
        anno_class_name = type(jsl_anno_object).__name__
        logger.info(
            f"Extracting model from Spark NLP pipeline: obj= {jsl_anno_object} class_name = {anno_class_name} and creating Component")
        if anno_class_name in os_annos.keys():
            # Open source
            jsl_anno_id = os_annos[anno_class_name]
            nlu_component = ComponentMap.os_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True, Licenses.open_source)
            constructed_components.append(nlu_component)
        elif anno_class_name in hc_annos.keys():
            # Licensed HC
            jsl_anno_id = hc_annos[anno_class_name]
            nlu_component = ComponentMap.hc_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True, Licenses.hc)
            constructed_components.append(nlu_component)
        elif anno_class_name in ocr_annos:
            # Licensed OCR (WIP)
            jsl_anno_id = ocr_annos[anno_class_name]
            nlu_component = ComponentMap.ocr_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True, Licenses.ocr)
            constructed_components.append(nlu_component)
        else:
            raise ValueError(
                f'Could not find matching nlu component for annotator class = {anno_class_name}')
    if None in constructed_components or len(constructed_components) == 0:
        raise Exception(f"Failure inferring type anno_class={anno_class_name} ")
    return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
        PipeUtils.set_column_values_on_components_from_pretrained_pipe(
            constructed_components, nlp_ref, language, path))
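# Hypothetical usage sketch for the converter above; the nlu_ref value is
# illustrative, and 'explain_document_dl' is a public (open-source) pipeline.
components = construct_component_from_pipe_identifier(
    language='en', nlp_ref='explain_document_dl', nlu_ref='en.explain.dl')
for c in components:
    print(type(c).__name__)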
class ReplaceTerms:
    """ A class to generate sentence perturbations by replacement

    ...

    Methods
    ----------
    replace_terms(sentence, importance_scores, num_replacements,
                  num_output_sents, sampling_strategy, sampling_k)
        Generate perturbed sentences by replacing terms in the input sentence
    """
    global SPARK_NLP_ENABLED

    def __init__(self, rep_type: str = 'synonym', use_ner: bool = True):
        """Instantiate a ReplaceTerms object

        Parameters
        ----------
        rep_type : Optional(str)
            The type of target perturbation. May include `synonym` for
            word2vec replacement, `mlmsynonym` for MLM-based replacement, or
            `misspelling` for misspelling replacement.
        use_ner : Optional(bool)
            Flag specifying whether to use entity-aware replacement. If True,
            when calculating the sampling weights for any perturbation, named
            entities will be zeroed. In this case, the NER model is loaded
            here. The default value is True.
        """
        self.use_ner = use_ner if SPARK_NLP_ENABLED else False
        self.rep_type = rep_type
        if rep_type not in ['synonym', 'misspelling', 'mlmsynonym']:
            logger.error('{}:ReplaceTerms __init__ invalid rep_type'.format(
                __file__.split('/')[-1]))
            raise ValueError('Not an accepted generator type')
        self._generator = self._get_generator(rep_type)
        if not self._generator:
            raise RuntimeError('Unable to init generator')
        if self.use_ner:
            try:
                spark = sparknlp.start()
                self._ner_pipeline = PretrainedPipeline(
                    'recognize_entities_dl', lang='en')
            except Exception as e:
                logger.error(
                    '{}:ReplaceTerms __init__ unable to load NER pipeline'.format(
                        __file__.split('/')[-1]))
                raise RuntimeError('Unable to load ner pkg')

    def _get_entities(self, sentence: str):
        """ Tokenize and annotate sentence; returns (mask, tokens) """
        if self.use_ner:
            # Use spark-nlp tokenizer for entity-aware mask
            allowed_tags = ['PER', 'LOC', 'ORG', 'MISC']

            # Annotate your testing dataset
            result = self._ner_pipeline.annotate(sentence)
            toks = result['token']
            mask = [
                1 if (any([y in x for y in allowed_tags])
                      or not toks[i].isalnum()) else 0
                for i, x in enumerate(result['ner'])
            ]
        else:
            # Use simple NLTK tokenizer
            toks = nltk.word_tokenize(sentence)
            mask = [0] * len(toks)
        return mask, toks

    def _get_generator(self, name: str = None):
        if name == 'synonym':
            try:
                _syn = SynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        elif name == 'misspelling':
            try:
                _missp = MisspReplace()
                return _missp
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load misspellings'.format(
                        __file__.split('/')[-1]))
        elif name == 'mlmsynonym':
            try:
                _syn = MLMSynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        return

    def replace_terms(self,
                      sentence: str,
                      importance_scores: List = None,
                      num_replacements: int = 1,
                      num_output_sents: int = 1,
                      sampling_strategy: str = 'random',
                      sampling_k: int = None) -> List:
        """Generate a certain number of sentence perturbations by replacement
        using either misspelling or synonyms

        Parameters
        ----------
        sentence : str
            The input sentence to be perturbed.
        importance_scores : Optional(List)
            List of tuples defining a weight for each term in the tokenized
            sentence. These weights are used during sampling to influence
            perturbation probabilities. If None, uniform sampling is used by
            default.
        num_replacements : Optional(int)
            Target number of terms to replace in the original sentence. The
            number is chosen randomly using the target as an upper bound, and
            lower bound of 1. The default is 1.
        num_output_sents : Optional(int)
            Target number of perturbed sentences to generate based on the
            original sentence. The default is 1.
        sampling_strategy : Optional(str)
            Strategy used to sample terms to perturb in the original
            sentence. The default is random. If importance_scores is given,
            then sampling_strategy may be `topK` or `bottomK`, in which case
            the importance_scores (or inverted scores) vector is used for
            weighted sampling.
        sampling_k : Optional(int)
            The number of terms in the importance score vector to include in
            topK or bottomK sampling. This parameter is not used by the
            default sampling_strategy, `random` sampling.

        Returns
        -------
        [str]
            Returns a list of perturbed sentences for the input sentence.

        Example
        -------
        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="synonym")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_terms, num_output_sents)
        ['I born in a small village']

        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="misspelling")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.replace_terms(sent, num_terms, num_output_sents)
        ['I born in a smal town']
        """
        inputs = validate_inputs(num_replacements, num_output_sents,
                                 sampling_strategy)
        num_replacements = inputs.pop(0)
        num_output_sents = inputs.pop(0)
        sampling_strategy = inputs.pop(0)

        # Extract entities in the input sentence to mask
        masked_vector, tokens = self._get_entities(sentence)

        # Check if there are enough candidate terms
        if num_replacements > (len(masked_vector) - sum(masked_vector)):
            logger.warning(
                '{}:replace_terms: unable to generate num_replacements - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(masked_vector) - sum(masked_vector)))
            num_replacements = len(masked_vector) - sum(masked_vector)

        if self.rep_type == 'misspelling':
            remove_stop = True
        else:
            remove_stop = False

        # Initialize sampling scores
        importance_scores = get_scores(tokens, sampling_strategy, sampling_k,
                                       importance_scores, remove_stop)
        if not importance_scores:
            return []

        # Add index and mask to importance scores
        term_score_index = [(word[0], i, masked_vector[i])
                            for i, word in enumerate(importance_scores)]

        # Store only scores for later sampling
        importance_scores = [
            x[1] if not masked_vector[i] else 0  # set masked scores to zero
            for i, x in enumerate(importance_scores)
        ]

        # Candidate terms for synonym replacement
        rep_term_indices = [w[1] for w in term_score_index if not w[2]]

        # Create List of Lists of term variants
        generated = {x[0]: None for x in term_score_index}
        generated = {
            x[0]: self._generator.generate(x[0].lower(), 10, **{
                'toks': tokens,
                'token_idx': i
            })
            for i, x in enumerate(term_score_index) if not x[2]
        }
        term_variants = {
            x[0]: generated.get(x[0], [])
            if (i in rep_term_indices and not masked_vector[i]) else []
            for i, x in enumerate(term_score_index)
        }

        # Check if there are enough candidate terms
        if not term_variants:
            logger.warning(
                '{}:replace_terms: unable to generate num_variants - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(term_variants) - sum(masked_vector)))
        else:
            term_variants = {
                k: [x[0].upper() + x[1:] for x in v] if k[0].isupper() else v
                for k, v in term_variants.items()
            }

        # Set scores to zero for all terms w/o synonyms
        importance_scores = [
            x if (term_score_index[i][0] in term_variants
                  and len(term_variants[term_score_index[i][0]]) > 0) else 0
            for i, x in enumerate(importance_scores)
        ]

        # Renormalize
        if sum(importance_scores) == 0:
            return []  # avoid division by 0 error
        importance_scores = [
            x / sum(importance_scores) for x in importance_scores
        ]

        # Resize num_replacements to avoid p-sampling errors
        nonzero_entries = sum([x > 0. for x in importance_scores])
        if num_replacements > nonzero_entries:
            num_replacements = nonzero_entries

        '''
        # DEBUG
        # Create a List of Lists of all variants
        candidate_variants = [ v+[k] for k,v in term_variants.items() ]
        # Check the total number of variants
        candidate_sents = list( itertools.product(*candidate_variants) )
        # Set number of output variants to the total possible
        if len(candidate_sents) < num_output_sents:
            num_output_sents = len(candidate_sents)
        '''

        # Ensure there is at least one unmasked term to replace
        if not term_variants or len(
                [x for x in term_score_index if x[2] == 0]) == 0:
            raise Exception('no term variants or term_score_index')

        max_attempts = 50
        counter = 0
        new_sentences = set()
        while len(new_sentences) < num_output_sents:
            if counter > max_attempts:
                break

            # Select terms to replace
            rnd_indices = np.random.choice(len(term_score_index),
                                           size=num_replacements,
                                           replace=False,
                                           p=importance_scores)
            replace_terms = [term_score_index[i][0] for i in rnd_indices]

            # Create List of Lists of term variants
            term_combinations = [
                term_variants.get(x[0], [x[0]])
                if x[0] in replace_terms else [x[0]]
                for i, x in enumerate(term_score_index)
            ]

            # Generate combinatorial variants
            candidate_sents = list(itertools.product(*term_combinations))
            for sent in candidate_sents:
                new_sentences.add(' '.join(sent))
            counter += 1

        # Shuffle permutations, sanitize and slice
        new_sentences = list(new_sentences)
        random.shuffle(new_sentences)
        new_sentences = [
            re.sub(r'([A-Za-z0-9])(\s+)([^A-Za-z0-9])', r'\1\3',
                   x.replace('\' s ', '\'s '))
            for x in new_sentences[:num_output_sents]
        ]
        new_sentences = [x for x in new_sentences if x != sentence]

        if len(new_sentences) < num_output_sents:
            logger.debug(
                '{}:replace_terms: unable to generate num_output_sents - {} of ({})'
                .format(
                    __file__.split('/')[-1], len(new_sentences),
                    num_output_sents))
        return new_sentences
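# Usage sketch mirroring the docstring example above; `use_ner=False` keeps
# the example runnable without Spark NLP (falls back to the NLTK tokenizer).
if __name__ == '__main__':
    p = ReplaceTerms(rep_type='synonym', use_ner=False)
    perturbed = p.replace_terms('I was born in a small town',
                                num_replacements=1,
                                num_output_sents=2)
    print(perturbed)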
def cleantext(text):  # function header restored; the fragment began mid-function
    text = re.sub('#', '', text)  # Removing '#' hash tag
    text = re.sub(r'RT[\s]+', '', text)  # Removing RT
    text = re.sub(r'https?:\/\/\S+', '', text)  # Removing hyperlink
    text = re.sub(':', '', text)  # remove colon
    text = re.sub(r'(\.|\!|\?|\,)', '', text)  # remove punctuation
    return text


udf_fun = udf(lambda text: cleantext(text), StringType())
preprocessed_text = twitter_df.select('id', udf_fun('text').alias('text'), 'user')
preprocessed_text.show()

# use pipeline
pipeline = PretrainedPipeline("analyze_sentiment")
result = pipeline.annotate(preprocessed_text, column='text')
# result.select("sentiment.result").show()

# write result to mongodb
cols = ['id', 'text', 'sentiment.result', 'user']
output = result.select(cols)
# output.show()
output.write\
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("collection", "sentiment_predicted") \
    .save()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StringType
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
import pandas as pd
import matplotlib.pyplot as plt
import re

bp = PretrainedPipeline.from_disk('Explain_document_dl_en')

spark.version
dir(sparknlp.base)

#!pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1
#!pip install spark-nlp

# CREATE SPARK SESSION
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

spark = SparkSession \
# Imports this job relies on (added; the fragment referenced them implicitly)
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StringType, ArrayType
from sparknlp.pretrained import PretrainedPipeline

# Load from Kinesis Stream
rawData = spark\
    .readStream\
    .format("kinesis")\
    .option("streamName", "tech-trends-stream")\
    .option("endpointUrl", "https://kinesis.eu-west-2.amazonaws.com")\
    .load()

tweetSchema = StructType() \
    .add("text", StringType()) \
    .add("hashtags", ArrayType(StringType()))

# Extract JSON data from Kinesis message
tweets = rawData \
    .selectExpr("cast (data as STRING) jsonData") \
    .select(from_json("jsonData", tweetSchema).alias("tweets")) \
    .select('tweets.text')

# Load Pipeline and Transform for Sentiment
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")
sentiments = pipeline.transform(tweets)
result = sentiments.select('text', 'sentiment')

# Write to JSON in S3
query = sentiments.writeStream\
    .format("json")\
    .option("path", "s3a://tech-trends-output/sentiments")\
    .option("checkpointLocation", "s3a://tech-trends-output/sentiments/checkpoint")\
    .start()
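# Sketch of the missing final step (an assumption, not in the original
# fragment): block the driver until the streaming query stops; without this,
# a plain script would exit immediately after start().
query.awaitTermination()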
def _clean_sent_pipeline(data_ip, input_col, import_c=True):
    print(f"\t\t\t---- Starting the pipeline built for >>> {input_col} <<< with import condition {import_c} ----")
    from pyspark.sql import functions as F
    data = data_ip
    from pyspark.sql.types import IntegerType
    data = data.withColumn("_c0", data["_c0"].cast(IntegerType()))
    text_col = input_col
    non_null_index = (data.filter(data[text_col].isNotNull())).select('_c0')
    text_clean = data.select(text_col).filter(F.col(text_col).isNotNull())
    print(f"\n\t1. Cleaning the input for Null {data.count()} to {data.count() - non_null_index.count()}")

    if import_c:
        from sparknlp.base import DocumentAssembler
    documentAssembler = sparknlp.base.DocumentAssembler().setInputCol(text_col).setOutputCol('document')
    print(f"\n\t2. Attaching DocumentAssembler Transformer to the pipeline")

    if import_c:
        from sparknlp.annotator import Tokenizer
    tokenizer = sparknlp.annotator.Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
    print(f"\n\t3. Attaching Tokenizer Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import Normalizer
    normalizer = sparknlp.annotator.Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
    print(f"\n\t4. Attaching Normalizer Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import LemmatizerModel
    lemmatizer = sparknlp.annotator.LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
    print(f"\n\t5. Attaching LemmatizerModel Annotator to the pipeline")

    if import_c:
        import nltk
        nltk.download("popular")
        from nltk.corpus import stopwords
    eng_stopwords = stopwords.words('english')
    print(f"\n\t6. nltk stop-words found")

    if import_c:
        from sparknlp.annotator import StopWordsCleaner
    stopwords_cleaner = sparknlp.annotator.StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
    print(f"\n\t7. Attaching StopWordsCleaner Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import NGramGenerator
    ngrammer = sparknlp.annotator.NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
    print(f"\n\t8. Attaching NGramGenerator Annotator to the pipeline")

    if import_c:
        from sparknlp.annotator import PerceptronModel
    pos_tagger = sparknlp.annotator.PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
    print(f"\n\t9. Attaching PerceptronModel Annotator to the pipeline")

    if import_c:
        from sparknlp.base import Finisher
    finisher = sparknlp.base.Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])
    print(f"\n\t10. Attaching Finisher Transformer to the pipeline")

    from pyspark.ml import Pipeline
    pipeline = Pipeline().setStages([documentAssembler, tokenizer, normalizer, lemmatizer, stopwords_cleaner, pos_tagger, ngrammer, finisher])
    print("\n\t\t\t ---- Pipeline Built Successfully ----")
    processed_tweets = pipeline.fit(text_clean).transform(text_clean)
    print("\n\t\t\t ---- Pipeline Fitted Successfully ----")

    from pyspark.sql.functions import concat
    processed_tweets = processed_tweets.withColumn('final', concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
    print("\n\tData Concatenation done - uni--ngrams")

    print("\n\t\t\t ---- Loading the Pre-trained Pipeline analyze_sentimentdl_use_twitter ----")
    from sparknlp.pretrained import PretrainedPipeline
    pipeline_sent = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")
    pipout_sent_results = pipeline_sent.transform(processed_tweets.withColumnRenamed(text_col, "text"))
    print("\n\t\t\t ---- Sentiments Fetched Successfully ----\n\n\n")

    from pyspark.sql.functions import col
    from pyspark.sql.functions import monotonically_increasing_id, row_number
    from pyspark.sql.window import Window
    pipout_sent_results = pipout_sent_results.withColumn("id_tmp", row_number().over(Window.orderBy(monotonically_increasing_id())))
    non_null_index = non_null_index.withColumn("id_tmp", row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("\n$$$ Indexing done for the Compiled Result")

    data_op = data.join(non_null_index.join(pipout_sent_results, on=["id_tmp"]).drop("id_tmp"), on=["_c0"], how='left_outer')
    data_op = data_op.withColumn("_c0", data_op["_c0"].cast(IntegerType()))
    print("\n$$$ Joining the final results with the original dataframe")
    print(f"\nOriginal IP={data.count()} \nNonNull Index={non_null_index.count()} \nNull_Clean={text_clean.count()} \nOriginal OP={data_op.count()}")
    print(data.show(4))

    final_results = data_op.orderBy("_c0")
    print("\n$$$ Spark Created")
    id = list((((final_results.select('str_id')).toPandas())).str_id)
    createdat = list((((final_results.select('created_at')).toPandas())).created_at)
    fulltext = list((((final_results.select('full_text')).toPandas())).full_text)
    favoritecount = list((((final_results.select('favorite_count')).toPandas())).favorite_count)
    retweetcount = list((((final_results.select('retweet_count')).toPandas())).retweet_count)
    pipeclean = list((((final_results.select('text')).toPandas())).text)
    textlen = list(((final_results.select('finished_unigrams')).toPandas()).finished_unigrams.apply(lambda row: int(len(row))))
    sentscores = list(((final_results.select('sentiment')).toPandas()).sentiment.apply(lambda row: (((str(row)).split(",")[3]).split("'")[1])))

    # p is assumed to be pandas (import pandas as p)
    op_df = p.DataFrame(list(zip(id, createdat, fulltext, favoritecount, retweetcount, pipeclean, textlen, sentscores)),
                        columns=['str_id', 'created_at', 'text_full', 'favorite_count', 'retweet_count', 'text_pipe_clean', 'text_length', 'sentiment_score'])
    print("\n$$$ Pandas Created")
    print(op_df.head(4))
    return op_df
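# Hypothetical call for the function above: tweets_df is an illustrative
# DataFrame with an integer '_c0' index column and a 'full_text' text column.
op_df = _clean_sent_pipeline(data_ip=tweets_df, input_col='full_text')
print(op_df.shape)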
def load_pipeline(name):
    return PretrainedPipeline(name, lang='en')
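# Optional variant (a sketch, not the original code): memoize loaded
# pipelines so repeated calls don't re-download or re-deserialize a model.
from functools import lru_cache

@lru_cache(maxsize=None)
def load_pipeline_cached(name):
    return PretrainedPipeline(name, lang='en')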
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None):
    '''
    Creates a list of components from a Spark NLP Pipeline reference:
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param nlu_ref:
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param path: Load pipe from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    if 'language' in nlp_ref:
        language = 'xx'  # special edge case for lang detectors
    if path is None:
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []
    for component in iterable_stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        if isinstance(component, NerConverter):
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, PerceptronModel):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, (ClassifierDl, ClassifierDLModel)):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='use'))
        elif isinstance(component, BertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert'))
        elif isinstance(component, AlbertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='albert'))
        elif isinstance(component, XlnetEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='xlnet'))
        elif isinstance(component, WordEmbeddingsModel):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='glove'))
        elif isinstance(component, ElmoEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='elmo'))
        elif isinstance(component, BertSentenceEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert_sentence'))
        elif isinstance(component, TokenizerModel) and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif isinstance(component, TokenizerModel) and parsed == 'regex':
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif isinstance(component, DocumentAssembler):
            constructed_components.append(nlu.Util(model=component))
        elif isinstance(component, SentenceDetectorDLModel):
            constructed_components.append(NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component))
        elif isinstance(component, (SentenceDetectorDLModel, SentenceDetector)):
            constructed_components.append(NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component))
        elif isinstance(component, RegexMatcherModel) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component, annotator_class='regex'))
        elif isinstance(component, TextMatcherModel):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='text'))
        elif isinstance(component, DateMatcher):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='date'))
        elif isinstance(component, ContextSpellCheckerModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context'))
        elif isinstance(component, SymmetricDeleteModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='symmetric'))
        elif isinstance(component, NorvigSweetingModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig'))
        elif isinstance(component, LemmatizerModel):
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif isinstance(component, NormalizerModel):
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif isinstance(component, Stemmer):
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif isinstance(component, (NerDLModel, NerCrfModel)):
            # Pipelines don't always extract confidences, so here we enable
            # all pipes to extract confidences manually
            component.setIncludeConfidence(True)
            constructed_components.append(nlu.Classifier(model=component, annotator_class='ner'))
        elif isinstance(component, LanguageDetectorDL):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='language_detector'))
        elif isinstance(component, DependencyParserModel):
            constructed_components.append(UnlabledDepParser(model=component))
        elif isinstance(component, TypedDependencyParserModel):
            constructed_components.append(LabledDepParser(model=component))
        elif isinstance(component, (SentimentDetectorModel, SentimentDLModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl'))
        elif isinstance(component, (SentimentDetectorModel, ViveknSentimentModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn'))
        elif isinstance(component, Chunker):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, NGram):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, ChunkEmbeddings):
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif isinstance(component, StopWordsCleaner):
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        elif isinstance(component, (TextMatcherModel, RegexMatcherModel, DateMatcher, MultiDateMatcher)) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif isinstance(component, T5Transformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component))
        elif isinstance(component, MarianTransformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component))
        else:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion")
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        logger.info(f"Extracted into NLU Component type : {parsed}")
    if None in constructed_components:
        logger.exception(
            f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion")
        return None
    return constructed_components
def extract_sentiment(aws_conn_id: str, tweets_path: str, summary_path: str,
                      language: str, **kwargs):
    aws_hook = AwsHook(aws_conn_id=aws_conn_id)
    aws_credentials = aws_hook.get_credentials()

    spark = (
        SparkSession.builder.master("local[*]")
        .appName("Analyse sentiment of given tweets")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        # .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0,"
        #         "org.apache.hadoop:hadoop-common:3.2.0,"
        #         "org.apache.hadoop:hadoop-annotations:3.2.0,"
        #         "org.apache.hadoop:hadoop-auth:3.2.0,"
        #         "org.apache.hadoop:hadoop-client:3.2.0")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
        .config("spark.hadoop.fs.s3a.access.key", aws_credentials.access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_credentials.secret_key)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.endpoint", "s3-eu-central-1.amazonaws.com")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
        .getOrCreate())

    year = kwargs['execution_date'].year
    month = kwargs['execution_date'].month
    day = kwargs['execution_date'].day
    tweets_path = f'{tweets_path}/{year:04d}/{month:02d}/{day:02d}/*.jsonl.gz'
    summary_path = f'{summary_path}/{year:04d}-{month:02d}-{day:02d}.jsonl'

    logging.info(f'Reading tweets from: {tweets_path}')
    tweets = spark.read.json(tweets_path)
    english_tweets_only = tweets.select('full_text').where(
        tweets.lang == language)
    original_english_tweets_only = english_tweets_only.where(
        ~english_tweets_only.full_text.startswith('RT @'))

    sentiment_pipeline = PretrainedPipeline('analyze_sentiment', language)
    analysed_tweets = sentiment_pipeline.annotate(original_english_tweets_only,
                                                  column='full_text')

    # Pick the most common sentence-level sentiment as the tweet's overall sentiment
    main_sentiment = udf(lambda col: Counter(col).most_common(1)[0][0],
                         StringType())
    tweets_with_overall_sentiment = (analysed_tweets.withColumn(
        'overall_sentiment',
        main_sentiment(analysed_tweets.sentiment.result)).drop(
            'document', 'sentence', 'token', 'checked'))
    tweets_sentiment_summary = tweets_with_overall_sentiment.groupBy(
        'overall_sentiment').count()

    tweets_sentiment_record = dict(
        tweets_sentiment_summary.rdd.map(
            lambda r: (r['overall_sentiment'], r['count'])).collect())
    tweets_sentiment_record['tweets_sentiment_id'] = f'{year:04d}-{month:02d}-{day:02d}({language})'
    tweets_sentiment_record['year'] = year
    tweets_sentiment_record['month'] = month
    tweets_sentiment_record['day'] = day
    tweets_sentiment_record['language'] = language
    tweets_sentiment_record['positive_count'] = tweets_sentiment_record['positive']
    tweets_sentiment_record['negative_count'] = tweets_sentiment_record['negative']
    tweets_sentiment_record['na_count'] = tweets_sentiment_record['na']
    del tweets_sentiment_record['positive']
    del tweets_sentiment_record['negative']
    del tweets_sentiment_record['na']

    logging.info(
        f'Extracted sentiment summary for {year:04d}-{month:02d}-{day:02d}: {tweets_sentiment_record}')
    tweets_sentiment = spark.createDataFrame([tweets_sentiment_record])
    tweets_sentiment.write.json(summary_path, mode='overwrite')
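# Hypothetical Airflow 1.x wiring for the task above; the task id, connection
# id and S3 paths are illustrative, and `dag` is assumed to be defined elsewhere.
from airflow.operators.python_operator import PythonOperator

extract_sentiment_task = PythonOperator(
    task_id='extract_sentiment',
    python_callable=extract_sentiment,
    op_kwargs={
        'aws_conn_id': 'aws_default',
        'tweets_path': 's3a://my-bucket/tweets',
        'summary_path': 's3a://my-bucket/sentiment-summaries',
        'language': 'en',
    },
    provide_context=True,  # passes execution_date into **kwargs
    dag=dag,
)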
import sparknlp
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, StringType
from sparknlp.pretrained import PretrainedPipeline

# 1. Setup
sparknlp.start()
conf = SparkConf().setAppName('parallel-project')
sc = SparkContext.getOrCreate()
spark = SQLContext(sc)
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# 2. Data Cleansing
# read in data to a DataFrame
comments = spark.read.json('RC_2019-02-28-one-day')
# dummy_data = [["Hello, world!", "/r/soccer"],
#               ["Wow. Simply wow. What an unbelievable pass, inch perfect.", "/r/nba"]]
# comments = sc.parallelize(dummy_data).toDF(['body', 'subreddit'])
comments.printSchema()

# Rename 'body' to 'text' for spark-nlp
comments = comments.withColumnRenamed('body', 'text')

# keep only the columns we're interested in
commentsCleaned = comments.select('subreddit', 'text')

# Filter out bad comment data
commentsCleaned = commentsCleaned.filter(commentsCleaned.text != '[deleted]')\
    .filter(commentsCleaned.text != '[removed]')\
dfSpark.select(['id', 'tokens', 'refined_tokens']).show(10)

# COMMAND ----------

dfSpark.filter(((dfSpark.publication == 'New York Times') | (dfSpark.publication == 'Vox')))

# COMMAND ----------

display(dfSpark.groupBy('publication').count())
# display(dfSpark.groupBy('year').count())

# COMMAND ----------

sparknlp.start()
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# COMMAND ----------

dfSpark = dfSpark.withColumn('Sentimiento', when(rand() > 0.5, 1).otherwise(0))

# COMMAND ----------

dfSpark = dfSpark.withColumn(
    "label", dfSpark.Sentimiento.cast('float')).drop('Sentimiento')

# COMMAND ----------

dfSpark.orderBy(rand()).show(10)

# COMMAND ----------