def construct_component_from_identifier(language, component_type='', dataset='', component_embeddings='', nlu_ref='',
                                        nlp_ref=''):
    '''
    Create a NLU component from a pretrained Spark NLP model reference or a class reference.

    The identifiers are checked against the NameSpace lookup tables first (seq2seq, word embeddings,
    sentence embeddings, classifiers) and afterwards against a fixed, ordered list of keyword
    substrings (spell, dep, lemma, ...). The first rule that matches decides which component is built,
    so the order of the checks below is significant.

    NOTE(review): another definition with the same name is visible later in this source; if both live
    in the same module the later one shadows this one - confirm which is intended.

    :param language: Language of the Spark NLP model reference
    :param component_type: Class identifier used to pick/instantiate the model
    :param dataset: Dataset the model was trained on; its '_'-separated parts are also matched against
        the embedding and classifier namespaces
    :param component_embeddings: Embeddings the model was trained on (if any). Not used by this function.
    :param nlu_ref: Full user request
    :param nlp_ref: Full Spark NLP reference
    :return: The constructed NLU component, or None if no rule matched or building the component raised
    '''
    logger.info('Creating singular NLU component for type=%s sparknlp_ref=%s , dataset=%s, language=%s , nlu_ref=%s ',
                component_type, nlp_ref, dataset, language, nlu_ref)

    # The four identifiers every dispatch rule inspects, in lookup order.
    refs = [nlp_ref, nlu_ref, dataset, component_type]

    def any_ref_contains(*needles):
        # True if any identifier contains any of the given substrings.
        return any(needle in ref for ref in refs for needle in needles)

    try:
        if any(x in NameSpace.seq2seq for x in refs):
            return Seq2Seq(annotator_class=component_type, language=language, get_default=False, nlp_ref=nlp_ref,
                           configs=dataset)

        # Built only after the seq2seq check so a seq2seq hit returns before `dataset` is split.
        refs_and_dataset_parts = refs + dataset.split('_')

        # Embedding rules must not fire for names that are also registered as classifiers.
        if any(x in NameSpace.word_embeddings and x not in NameSpace.classifiers for x in refs_and_dataset_parts):
            return Embeddings(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)
        if any(x in NameSpace.sentence_embeddings and x not in NameSpace.classifiers
               for x in refs_and_dataset_parts):
            return Embeddings(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)
        if any(x in NameSpace.classifiers for x in refs_and_dataset_parts):
            return Classifier(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language)

        if any_ref_contains('spell'):
            return SpellChecker(annotator_class=component_type, language=language, get_default=True,
                                nlp_ref=nlp_ref, dataset=dataset)
        # A single identifier must mention 'dep' without 'untyped' to select the labeled parser.
        if any('dep' in x and 'untyped' not in x for x in refs):
            return LabledDepParser()
        # 'dep.untyped' always contains 'untyped', so one substring test covers both spellings.
        if any_ref_contains('untyped'):
            return UnlabledDepParser()
        if any_ref_contains('lemma'):
            return nlu.lemmatizer.Lemmatizer(language=language, nlp_ref=nlp_ref)
        if any_ref_contains('norm'):
            return nlu.normalizer.Normalizer(nlp_ref=nlp_ref, nlu_ref=nlu_ref)
        if any_ref_contains('clean', 'stopword'):
            return nlu.StopWordsCleaner(language=language, get_default=False, nlp_ref=nlp_ref)
        if any_ref_contains('sentence_detector'):
            return NLUSentenceDetector(nlu_ref=nlu_ref, nlp_ref=nlp_ref, language=language)
        if any_ref_contains('match'):
            return Matcher(nlu_ref=nlu_ref, nlp_ref=nlp_ref)
        # This rule must also capture the word segmenter ('segment_words').
        if any_ref_contains('tokenize', 'segment_words'):
            return nlu.tokenizer.Tokenizer(nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language, get_default=False)
        if any_ref_contains('stem'):
            return Stemmer()
        # 'embed_chunk' is planned for a future version (with automatic embedding generation);
        # until then such references fall through to the generic chunker rule below.
        if any_ref_contains('chunk'):
            return nlu.chunker.Chunker()
        if component_type == 'ngram':
            return nlu.chunker.Chunker('ngram')

        # Nothing matched. No exception is active here, so log with error() rather than
        # exception(), which would append a meaningless "NoneType: None" traceback.
        logger.error('EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
                     component_type, nlp_ref, nlu_ref)
        return None
    except Exception:
        # A reference that is neither in the namespace nor a known component can make the lookups or
        # constructors above raise. Log it and return None instead of crashing the caller.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and nlp_ref=%s and nlu_ref=%s',
                         component_type, nlp_ref, nlu_ref)
        return None
def construct_component_from_identifier(language, component_type, dataset, component_embeddings, nlu_reference,
                                        sparknlp_reference):
    '''
    Create a NLU component from a pretrained Spark NLP model reference or a class reference.

    The identifiers are tested against an ordered list of rules; the first rule that matches decides
    which component is built, so the order of the checks below is significant.

    :param language: Language of the Spark NLP model reference
    :param component_type: Class identifier used to pick/instantiate the model
    :param dataset: Dataset that the model was trained on
    :param component_embeddings: Embeddings the model was trained on (if any). Not used by this function.
    :param nlu_reference: Full user request
    :param sparknlp_reference: Full Spark NLP reference
    :return: The constructed NLU component, or None if no rule matched or building the component raised
    '''
    logger.info('Creating singular NLU component for type=%s sparknlp reference=%s , dataset=%s, language=%s ',
                component_type, sparknlp_reference, dataset, language)
    try:
        if sparknlp_reference == 'yake':
            return Classifier('yake')

        # Any hint of an embedding model in the identifiers routes to Embeddings.
        is_embedding = (
            'bert' in dataset
            or component_type == 'embed'
            or 'albert' in component_type
            or 'bert' in component_type
            or 'xlnet' in component_type
            or 'use' in component_type
            or 'glove' in component_type
            or 'elmo' in component_type
            or 'tfhub_use' in sparknlp_reference
            or 'bert' in sparknlp_reference
            or 'labse' in sparknlp_reference
            or component_type == 'embed_sentence'
            or 'electra' in nlu_reference
        )
        if is_embedding:
            if component_type == 'embed' and dataset != '':
                return Embeddings(component_name=dataset, language=language, get_default=False,
                                  sparknlp_reference=sparknlp_reference)
            if component_type == 'embed':
                # Plain 'embed' without a dataset: only the component name is passed.
                return Embeddings(component_name=sparknlp_reference)
            return Embeddings(component_name=component_type, language=language, get_default=False,
                              sparknlp_reference=sparknlp_reference)

        if component_type == 'classify' or 'e2e' in sparknlp_reference:
            # Prefer the dataset as component name when an explicit 'classify' request names one.
            if component_type == 'classify' and dataset != '':
                classifier_name = dataset
            else:
                classifier_name = component_type
            return Classifier(component_name=classifier_name, language=language, get_default=False,
                              sparknlp_reference=sparknlp_reference)

        if component_type == 'tokenize':
            return nlu.tokenizer.Tokenizer(component_name=component_type, language=language, get_default=False,
                                           sparknlp_reference=sparknlp_reference)
        if component_type == 'pos':
            return Classifier(component_name=component_type, language=language, get_default=False,
                              sparknlp_reference=sparknlp_reference)
        # Checked before sentiment/emotion so an 'ner_dl' reference wins over those types.
        if component_type == 'ner' or 'ner_dl' in sparknlp_reference:
            return Classifier(component_name='ner', language=language, get_default=False,
                              sparknlp_reference=sparknlp_reference)
        if component_type == 'sentiment' or component_type == 'emotion':
            return Classifier(component_name=component_type, language=language, get_default=False,
                              sparknlp_reference=sparknlp_reference)
        if component_type == 'spell':
            return SpellChecker(component_name=component_type, language=language, get_default=False,
                                sparknlp_reference=sparknlp_reference, dataset=dataset)

        # There are no trainable dependency parsers, so both variants request the default model.
        if component_type == 'dep' and dataset != 'untyped':
            return LabledDepParser(component_name='labeled_dependency_parser', language=language, get_default=True,
                                   sparknlp_reference=sparknlp_reference)
        if component_type == 'dep.untyped' or dataset == 'untyped':
            return UnlabledDepParser(component_name='unlabeled_dependency_parser', language=language,
                                     get_default=True, sparknlp_reference=sparknlp_reference)

        if component_type == 'lemma':
            return nlu.lemmatizer.Lemmatizer(component_name=component_type, language=language, get_default=False,
                                             sparknlp_reference=sparknlp_reference)
        if component_type == 'norm':
            return nlu.normalizer.Normalizer(component_name='normalizer', language=language, get_default=True,
                                             sparknlp_reference=sparknlp_reference)
        if component_type == 'clean' or component_type == 'stopwords':
            return nlu.StopWordsCleaner(language=language, get_default=False,
                                        sparknlp_reference=sparknlp_reference)
        if component_type == 'sentence_detector':
            return NLUSentenceDetector(component_name=component_type, language=language, get_default=True,
                                       sparknlp_reference=sparknlp_reference)
        if component_type == 'match':
            return Matcher(component_name=dataset, language=language, get_default=True,
                           sparknlp_reference=sparknlp_reference)
        if component_type == 'stem' or component_type == 'stemm' or sparknlp_reference == 'stemmer':
            return Stemmer()
        if component_type == 'chunk':
            return nlu.chunker.Chunker()
        if component_type == 'ngram':
            return nlu.chunker.Chunker('ngram')
        if component_type == 'embed_chunk':
            return embeddings_chunker.EmbeddingsChunker()
        if component_type == 'regex' or sparknlp_reference == 'regex_matcher':
            return nlu.Matcher(component_name='regex')
        if component_type == 'text' or sparknlp_reference == 'text_matcher':
            return nlu.Matcher(component_name='text')

        # Nothing matched. No exception is active here, so log with error() rather than
        # exception(), which would append a meaningless "NoneType: None" traceback.
        logger.error('EXCEPTION: Could not resolve singular Component for type=%s and sparknl reference=%s and nlu reference=%s',
                     component_type, sparknlp_reference, nlu_reference)
        return None
    except Exception:
        # A reference that is neither in the namespace nor a known component can make the checks or
        # constructors above raise. Log it and return None instead of crashing the caller.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        logger.exception('EXCEPTION: Could not resolve singular Component for type=%s and sparknl reference=%s and nlu reference=%s',
                         component_type, sparknlp_reference, nlu_reference)
        return None