def construct_trainable_component_from_identifier(nlu_ref, nlp_ref):
    '''
    Create the trainable NLU component that wraps a Spark NLP annotator Approach.

    :param nlu_ref: NLU reference to the trainable model, e.g. 'train.classifier'
    :param nlp_ref: Spark NLP reference to the trainable model; only used for logging here
    :return: the trainable model wrapped as an NLU component, or None when the
        reference is not handled below or the component could not be created
    '''
    logger.info(f'Creating trainable NLU component for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}')
    try:
        if nlu_ref in ('train.deep_sentence_detector', 'train.sentence_detector'):
            # no label col but trainable?
            return nlu.NLUSentenceDetector(annotator_class='deep_sentence_detector', trainable=True)
        if nlu_ref in ('train.classifier_dl', 'train.classifier'):
            return nlu.Classifier(annotator_class='classifier_dl', trainable=True)
        if nlu_ref in ('train.ner', 'train.named_entity_recognizer_dl'):
            return nlu.Classifier(annotator_class='ner', trainable=True)
        if nlu_ref in ('train.sentiment_dl', 'train.sentiment'):
            return nlu.Classifier(annotator_class='sentiment_dl', trainable=True)
        if nlu_ref == 'train.pos':
            return nlu.Classifier(annotator_class='pos', trainable=True)
        if nlu_ref == 'train.multi_classifier':
            return nlu.Classifier(annotator_class='multi_classifier', trainable=True)
        if nlu_ref in ('train.word_seg', 'train.word_segmenter'):
            return nlu.Tokenizer(annotator_class='word_segmenter', trainable=True)
        # Recognized references without a trainable component yet; they fall
        # through to the final `return None` just like unknown references:
        #   train.context_spell, train.spell, train.symmetric_spell,
        #   train.norvig_spell, train.unlabeled_dependency_parser,
        #   train.labeled_dependency_parser, train.vivekn_sentiment
    except Exception:
        # If the reference is not in the namespace and not a component,
        # construction fails; log it and report the failure as None.
        logger.exception(
            f'EXCEPTION: Could not create trainable NLU component for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}')
        return None
    return None
def construct_component_from_pipe_identifier(language, sparknlp_reference):
    '''
    Create a list of NLU components from a Spark NLP pipeline reference.

    1. download the pipeline
    2. unpack the pipeline into its annotators and wrap each one as an NLU component
    3. return the list of NLU components

    Stages whose type is not recognized by any branch below are skipped.

    :param language: language of the pipeline
    :param sparknlp_reference: reference to a Spark NLP pretrained pipeline
    :return: the stages of the Spark NLP pipeline, each wrapped as an NLU component,
        inside of a list; None if any wrapped component is None
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in sparknlp_reference:
        language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(sparknlp_reference, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        # The component type is inferred from the text before the first '_'
        # in the stage's string representation.
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        if 'NerConverter' in component.name:
            constructed_components.append(Util(component_name='ner_converter', model=component))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            # todo differentiate normal and deep detector
            constructed_components.append(nlu.Util(component_name='sentence_detector', model=component))
        elif parsed == 'regex':
            constructed_components.append(nlu.Matcher(component_name='regex', model=component))
        elif parsed == 'text':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            # Normalizer is looked up in nlu.normalizer, not nlu.lemmatizer.
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif parsed == 'pos' or parsed == 'language':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(component_name='ner', model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'multi':
            constructed_components.append(nlu.Util(model=component))  # todo util abuse
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed in ('universal', 'bert', 'albert', 'elmo', 'xlnet', 'glove',
                        'electra', 'covidbert', 'small_bert', ''):
            # NOTE(review): `parsed` never contains '_', so 'small_bert' cannot
            # match here; kept as in the original list.
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'vivekn':
            constructed_components.append(nlu.Classifier(component_name='vivekn', model=component))
        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif '2e2' in parsed:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'embeddings_chunk':
            # NOTE(review): `parsed` never contains '_', so this branch cannot
            # match as written — confirm the intended key.
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        logger.info("Extracted into NLU Component type : %s", parsed)
    if None in constructed_components:
        # Not inside an exception handler, so log with error() instead of exception().
        logger.error(
            "EXCEPTION: Could not infer component type for lang=%s and sparknlp_reference=%s during pipeline conversion,",
            language, sparknlp_reference)
        return None
    return constructed_components
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None):
    '''
    Create a list of NLU components from a Spark NLP pipeline.

    1. download the pipeline, or load it from `path` when one is given
    2. unpack the pipeline into its annotators and wrap each one as an NLU component
    3. return the list of NLU components

    A stage whose type cannot be inferred is logged and wrapped with the
    Normalizer component as a fallback.

    :param language: language of the pipeline
    :param nlp_ref: reference to a Spark NLP pretrained pipeline
    :param nlu_ref: NLU reference of the pipeline; not used by this function
    :param path: when not None, load the pipeline from this location instead of
        fetching the pretrained pipeline
    :return: the stages of the Spark NLP pipeline, each wrapped as an NLU component,
        inside of a list; None if any wrapped component is None
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    if 'language' in nlp_ref:
        language = 'xx'  # special edge case for lang detectors
    if path is None:
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []
    for component in iterable_stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        # Text before the first '_' in the stage's string representation.
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        # The order of the branches matters: the first matching one wins.
        if isinstance(component, NerConverter):
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, PerceptronModel):
            # NOTE(review): PerceptronModel is given nlp_ref='classifierdl' — confirm this is intended.
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, (ClassifierDl, ClassifierDLModel)):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='use'))
        elif isinstance(component, BertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert'))
        elif isinstance(component, AlbertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='albert'))
        elif isinstance(component, XlnetEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='xlnet'))
        elif isinstance(component, WordEmbeddingsModel):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='glove'))
        elif isinstance(component, ElmoEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='elmo'))
        elif isinstance(component, BertSentenceEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert_sentence'))
        elif isinstance(component, TokenizerModel):
            if parsed != 'regex':
                constructed_components.append(nlu.Tokenizer(model=component))
            else:
                constructed_components.append(
                    nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif isinstance(component, DocumentAssembler):
            constructed_components.append(nlu.Util(model=component))
        elif isinstance(component, SentenceDetectorDLModel):
            constructed_components.append(
                NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component))
        elif isinstance(component, SentenceDetector):
            constructed_components.append(
                NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component))
        elif isinstance(component, RegexMatcherModel) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component, annotator_class='regex'))
        elif isinstance(component, TextMatcherModel):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='text'))
        elif isinstance(component, DateMatcher):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='date'))
        elif isinstance(component, ContextSpellCheckerModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context'))
        elif isinstance(component, SymmetricDeleteModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='symmetric'))
        elif isinstance(component, NorvigSweetingModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig'))
        elif isinstance(component, LemmatizerModel):
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif isinstance(component, NormalizerModel):
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif isinstance(component, Stemmer):
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif isinstance(component, (NerDLModel, NerCrfModel)):
            # Pipes don't always extract confidences, so enable it manually here.
            component.setIncludeConfidence(True)
            constructed_components.append(nlu.Classifier(model=component, annotator_class='ner'))
        elif isinstance(component, LanguageDetectorDL):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='language_detector'))
        elif isinstance(component, DependencyParserModel):
            constructed_components.append(UnlabledDepParser(model=component))
        elif isinstance(component, TypedDependencyParserModel):
            constructed_components.append(LabledDepParser(model=component))
        elif isinstance(component, (SentimentDetectorModel, SentimentDLModel)):
            # NOTE(review): SentimentDetectorModel is given nlp_ref='sentimentdl' — confirm this is intended.
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl'))
        elif isinstance(component, ViveknSentimentModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn'))
        elif isinstance(component, Chunker):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, NGram):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, ChunkEmbeddings):
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif isinstance(component, StopWordsCleaner):
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        elif isinstance(component, MultiDateMatcher):
            # The other matcher types and parsed == 'match' are handled by earlier branches.
            constructed_components.append(nlu.Matcher(model=component))
        elif isinstance(component, T5Transformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component))
        elif isinstance(component, MarianTransformer):
            constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component))
        else:
            # Not inside an exception handler, so log with error() instead of exception().
            logger.error(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion,")
            # NOTE: the message below names Lemmatizer, but the fallback wrapper is the Normalizer.
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        logger.info(f"Extracted into NLU Component type : {parsed}")
    if None in constructed_components:
        logger.error(
            f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,")
        return None
    return constructed_components
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref):
    '''
    Create a list of NLU components from a Spark NLP pipeline reference.

    1. download the pipeline
    2. unpack the pipeline into its annotators and wrap each one as an NLU component
    3. return the list of NLU components

    A stage whose type cannot be inferred is logged and wrapped with the
    Normalizer component as a fallback.

    :param language: language of the pipeline
    :param nlp_ref: reference to a Spark NLP pretrained pipeline
    :param nlu_ref: NLU reference of the pipeline; not used by this function
    :return: the stages of the Spark NLP pipeline, each wrapped as an NLU component,
        inside of a list; None if any wrapped component is None
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in nlp_ref:
        language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(nlp_ref, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        # Text before the first '_' in the stage's string representation.
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        # The order of the branches matters: the first matching one wins.
        if c_name == 'NerConverter':
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif c_name == 'TokenizerModel':
            if parsed != 'regex':
                constructed_components.append(nlu.Tokenizer(model=component))
            else:
                constructed_components.append(
                    nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            constructed_components.append(nlu.Util(annotator_class='sentence_detector', model=component))
        elif parsed in ('regex', 'date', 'text'):
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif c_name == 'PerceptronModel':
            # NOTE(review): PerceptronModel is given annotator_class='classifierdl' — confirm this is intended.
            constructed_components.append(nlu.Classifier(annotator_class='classifierdl', model=component))
        elif c_name == 'ClassifierDLModel':
            # NOTE(review): ClassifierDLModel is given annotator_class='language_detector' — confirm this is intended.
            constructed_components.append(nlu.Classifier(annotator_class='language_detector', model=component))
        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            # NOTE(review): 'typed' stages are wrapped with UnlabledDepParser — confirm this is intended.
            constructed_components.append(nlu.UnlabledDepParser(model=component))
        elif parsed == 'multi':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'embeddings_chunk':
            # NOTE(review): `parsed` never contains '_', so this branch cannot
            # match as written — confirm the intended key.
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        else:
            # Not inside an exception handler, so log with error() instead of exception().
            logger.error(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
                language, nlp_ref)
            # NOTE: the message below names Lemmatizer, but the fallback wrapper is the Normalizer.
            logger.info("USING DEFAULT ANNOTATOR TYPE Lemmatizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        logger.info("Extracted into NLU Component type : %s", parsed)
    if None in constructed_components:
        logger.error(
            "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
            language, nlp_ref)
        return None
    return constructed_components