Example 1
    def enforce_AT_schema_on_embedding_processors(pipe):
        """For every embedding provider and consumer, enforce that their output col is named <output_level>@storage_ref for output_levels word,chunk,sentence aka document , i.e. word_embed@elmo or sentence_embed@elmo etc.."""
        for c in pipe.components:
            if ComponentUtils.is_embedding_provider(c):
                if '@' not in c.info.outputs[0]:
                    new_embed_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                        c, 'output')
                    c.info.outputs = [new_embed_AT_ref]
                    c.info.spark_output_column_names = [new_embed_AT_ref]
                    # setOutputCol expects the full column name string
                    c.model.setOutputCol(new_embed_AT_ref)

            if ComponentUtils.is_embedding_consumer(c):
                input_embed_col = ComponentUtils.extract_embed_col(c)
                if '@' not in input_embed_col:
                    new_embed_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                        c, 'input')
                    c.info.inputs.remove(input_embed_col)
                    c.info.inputs.append(new_embed_AT_ref)
                    c.info.spark_input_column_names.remove(input_embed_col)
                    c.info.spark_input_column_names.append(new_embed_AT_ref)
                    c.model.setInputCols(c.info.inputs)

        return pipe
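
A minimal sketch of the @-notation this enforces, with invented level and storage-ref values (the real code derives both from component metadata via ComponentUtils):

    # Hypothetical illustration of the AT (@) schema, not a library helper:
    # an embedding column is named <output_level>@<storage_ref> so that consumers
    # can be matched to the provider that owns that storage ref.
    def to_at_notation(output_level: str, storage_ref: str) -> str:
        return f"{output_level}@{storage_ref}"

    assert to_at_notation("word_embeddings", "elmo") == "word_embeddings@elmo"
    assert to_at_notation("sentence_embeddings", "tfhub_use") == "sentence_embeddings@tfhub_use"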
Example 2
    def check_and_fix_component_order(pipe: NLUPipeline):
        '''
        Reorder the components so that the pipeline can be processed iteratively by Spark NLP:
        each component must appear after the components that provide its inputs.
        Column names are not touched. Essentially a DAG task sort.
        '''
        logger.info("Starting to optimize component order ")
        correct_order_component_pipeline = []
        all_components_ordered = False
        all_components = pipe.components
        provided_features = []
        update_last_type = False
        last_type_sorted = None
        trainable_updated = False
        while not all_components_ordered:
            if update_last_type: last_type_sorted = None
            else: update_last_type = True
            for component in all_components:
                logger.info(
                    f"Optimizing order for component {component.info.name}")
                input_columns = ComponentUtils.clean_irrelevant_features(
                    component.info.spark_input_column_names, False)
                if last_type_sorted is None or component.info.type == last_type_sorted:
                    if set(input_columns).issubset(provided_features):
                        correct_order_component_pipeline.append(component)
                        if component in all_components:
                            all_components.remove(component)
                        provided_features += ComponentUtils.clean_irrelevant_features(
                            component.info.spark_output_column_names, False)
                        last_type_sorted = component.info.type
                        update_last_type = False
                        break
            if len(all_components) == 0: all_components_ordered = True

            if (len(all_components) == 1 and pipe.has_trainable_components
                    and not trainable_updated
                    and 'approach' in str(all_components[0].model).lower()
                    and 'sentence_embeddings@' in all_components[0].info.inputs):
                # special case, if trainable then we feed embed consumers on the first sentence embed provider
                # 1. Find first sent embed provider
                # 2. substitute any 'sent_embed@' consumer inputs for the provider col
                for f in provided_features:
                    if 'sentence_embeddings' in f and not trainable_updated:
                        all_components[0].info.spark_input_column_names.remove('sentence_embeddings@')
                        if 'sentence_embeddings@' in all_components[0].info.inputs:
                            all_components[0].info.inputs.remove('sentence_embeddings@')
                        all_components[0].info.spark_input_column_names.append(f)
                        if f not in all_components[0].info.inputs:
                            all_components[0].info.inputs.append(f)
                        trainable_updated = True

        pipe.components = correct_order_component_pipeline

        return pipe
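
The loop above is effectively a constrained topological sort over provided and required columns. As a rough sketch (a toy example, not library code), a Kahn-style ordering on an invented dependency map produces the same result:

    # Sketch only: order components so each one's required columns are already provided.
    # Component names and columns are invented; assumes the dependency graph is acyclic.
    from collections import deque

    requires = {'tokenizer': {'document'}, 'embeddings': {'token'}, 'ner': {'token', 'word_embeddings'}}
    provides = {'tokenizer': {'token'}, 'embeddings': {'word_embeddings'}, 'ner': {'ner'}}

    available, order = {'document'}, []
    pending = deque(requires)
    while pending:
        c = pending.popleft()
        if requires[c] <= available:  # all inputs satisfied
            order.append(c)
            available |= provides[c]
        else:
            pending.append(c)  # retry once more columns are provided
    print(order)  # ['tokenizer', 'embeddings', 'ner']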
Example 3
 def enforece_AT_embedding_provider_output_col_name_schema_for_list_of_components(
         pipe_list):
     """For every embedding provider, enforce that their output col is named <output_level>@storage_ref for output_levels word,chunk,sentence aka document , i.e. word_embed@elmo or sentence_embed@elmo etc.."""
     for c in pipe_list:
         if ComponentUtils.is_embedding_provider(c):
             level_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
                 c, 'output')
             c.info.outputs = [level_AT_ref]
             c.info.spark_output_column_names = [level_AT_ref]
             # level_AT_ref is a string; setOutputCol must receive the whole name, not just its first character
             c.model.setOutputCol(level_AT_ref)
     return pipe_list
Example 4
 def extract_required_features_refless_from_pipe(pipe: NLUPipeline):
     """Extract provided features from pipe, which have no storage ref"""
     provided_features_no_ref = []
     for c in pipe.components:
         for feat in c.info.inputs:
             if 'embed' not in feat: provided_features_no_ref.append(feat)
     return ComponentUtils.clean_irrelevant_features(
         provided_features_no_ref)
Example 5
    def get_missing_required_features(pipe: NLUPipeline):
        provided_features_no_ref = ComponentUtils.clean_irrelevant_features(
            PipelineQueryVerifier.extract_provided_features_refless_from_pipe(
                pipe))
        required_features_no_ref = ComponentUtils.clean_irrelevant_features(
            PipelineQueryVerifier.extract_required_features_refless_from_pipe(
                pipe))
        provided_features_ref = ComponentUtils.clean_irrelevant_features(
            PipelineQueryVerifier.extract_provided_features_ref_from_pipe(
                pipe))
        required_features_ref = ComponentUtils.clean_irrelevant_features(
            PipelineQueryVerifier.extract_required_features_ref_from_pipe(
                pipe))
        is_trainable = PipeUtils.is_trainable_pipe(pipe)
        conversion_candidates = PipelineQueryVerifier.extract_sentence_embedding_conversion_candidates(
            pipe)
        pipe.has_trainable_components = is_trainable
        if is_trainable and len(provided_features_ref) == 0:
            # Special case: when training, storage-ref requirements can be reset
            # and replaced by a plain sentence_embeddings requirement
            required_features_ref = []
            required_features_no_ref.append('sentence_embeddings')

        components_for_ner_conversion = []  # TODO: currently unused

        missing_features_no_ref = set(required_features_no_ref) - set(
            provided_features_no_ref)
        missing_features_ref = set(required_features_ref) - set(
            provided_features_ref)

        PipelineQueryVerifier.log_resolution_status(
            provided_features_no_ref,
            required_features_no_ref,
            provided_features_ref,
            required_features_ref,
            is_trainable,
            conversion_candidates,
            missing_features_no_ref,
            missing_features_ref,
        )
        return missing_features_no_ref, missing_features_ref, conversion_candidates
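
The core of the method is plain set arithmetic over feature names; a toy example with invented column names:

    # Missing features = required - provided, computed separately for names
    # with and without a storage ref.
    provided = {'token', 'word_embeddings@glove'}
    required = {'token', 'word_embeddings@glove', 'entities@onto'}
    print(required - provided)  # {'entities@onto'}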
Example 6
    def subsitute_leaf_output_names(pipe):
        """Change all output column names of leaves to something nicer, if they not already
        use AT notation"""

        for c in pipe.components:
            if PipeUtils.is_leaf_node(
                    c, pipe) and not ComponentUtils.has_AT_notation():
                # update name
                1

        return pipe
Example 7
    def check_if_there_component_with_col_in_components(
            component_list, features, except_component):
        """For a given list of features and a list of components, see if there are components taht provide this feature
        If yes, True, otherwise False
        """
        for c in component_list:
            if c.info.outputs[0] != except_component.info.outputs[0]:
                for f in ComponentUtils.clean_irrelevant_features(
                        c.info.spark_output_column_names, True):
                    if f in features: return True

        return False
Example 8
    def extract_sentence_embedding_conversion_candidates(pipe):
        """Extract information about embedding conversion candidates"""
        conversion_candidates_data = []
        for c in pipe.components:
            if ComponentUtils.component_has_embeddings_requirement(
                    c) and not PipeUtils.is_trainable_pipe(pipe):
                storage_ref = StorageRefUtils.extract_storage_ref(c)
                conversion_applicable, conversion_data = PipelineQueryVerifier.check_if_storage_ref_is_satisfied_or_get_conversion_candidate(
                    c, pipe, storage_ref)
                if conversion_applicable:
                    conversion_candidates_data.append(conversion_data)

        return conversion_candidates_data
Example 9
 def extract_provided_features_ref_from_pipe(pipe: NLUPipeline):
     """Extract provided features from pipe, which have  storage ref"""
     provided_features_ref = []
     for c in pipe.components:
         for feat in c.info.outputs:
             if 'embed' in feat:
                 if '@' not in feat:
                     provided_features_ref.append(
                         feat + "@" +
                         StorageRefUtils.extract_storage_ref(c))
                 else:
                     provided_features_ref.append(feat)
     return ComponentUtils.clean_irrelevant_features(provided_features_ref)
Example 10
    def check_and_fix_component_output_column_name_overlap(pipe: NLUPipeline):
        '''
        This method enforces that every component has a unique output column name.
        This issue can occur especially for classifiers or bert_embeddings:

        1. For each component, verify that all input column names are satisfied by checking all other components' output names
        2. When an input column is missing, do the following:
        2.1 Figure out the type of the missing input column. The name of the missing column should be equal to the type
        2.2 Check if there is already a component in the pipe which provides this input (there should be)
        2.3 When the providing component is found, update its output name, or update the original component's input name
        :return: NLU pipeline where the output and input column names of the models have been adjusted to each other
        '''


        for component_to_check in pipe.components:
            input_columns = set(
                component_to_check.info.spark_input_column_names)
            logger.info(
                f'Checking for component {component_to_check.info.name} whether input {input_columns} is satisfied by another component in the pipe'
            )
            for other_component in pipe.components:
                if component_to_check.info.name == other_component.info.name:
                    continue
                output_columns = set(
                    other_component.info.spark_output_column_names)
                input_columns -= output_columns  # set subtraction

            input_columns = ComponentUtils.clean_irrelevant_features(
                input_columns)

            if len(input_columns) != 0:  # fix missing column name
                for missing_column in input_columns:
                    for other_component in pipe.components:
                        if component_to_check.info.name == other_component.info.name:
                            continue
                        if other_component.info.type == missing_column:
                            # Update the output name of the component which provides our missing feature
                            other_component.info.spark_output_column_names = [
                                missing_column
                            ]
                            logger.info(
                                f'Setting output columns for component {other_component.info.name} to {missing_column} '
                            )
                            other_component.model.setOutputCol(missing_column)

        return pipe
Example 11
    def is_storage_ref_match(embedding_consumer, embedding_provider, pipe):
        """Check for 2 components, if one provides the embeddings for the other. Makes sure that output_level matches up (chunk/sent/tok/embeds)"""
        consumer_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
            embedding_consumer, 'input')
        provider_AT_ref = ComponentUtils.extract_storage_ref_AT_notation_for_embeds(
            embedding_provider, 'output')
        consum_level = ComponentUtils.extract_embed_level_identity(
            embedding_consumer, 'input')
        provide_level = ComponentUtils.extract_embed_level_identity(
            embedding_provider, 'output')

        consumer_ref = StorageRefUtils.extract_storage_ref(embedding_consumer)
        provider_ref = StorageRefUtils.extract_storage_ref(embedding_provider)

        # input/output levels must match
        if consum_level != provide_level: return False

        # Direct match on storage refs
        if consumer_ref == provider_ref: return True

        # If the storage refs do not match up directly, the provider may have been resolved via @-notation;
        # its nlu_ref then carries a storage ref that matches the consumer's ref for the correct embedding.
        if hasattr(embedding_provider.info, 'nlu_ref'):
            if consumer_ref == StorageRefUtils.extract_storage_ref(
                    embedding_provider.info.nlu_ref):
                return True

        # If the provider is a sentence_embedding_converter or chunk_embedding_converter, we must check
        # the storage ref of that converter's input; if it matches up, the provider's output will match the consumer.
        if embedding_provider.info.name in [
                "chunk_embedding_converter", 'sentence_embedding_converter'
        ]:  # TODO FOR RESOLUTION
            nlu_ref, conv_prov_storage_ref = PipelineQueryVerifier.get_converters_provider_info(
                embedding_provider, pipe)

        return False
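
A hedged miniature of the matching rule, using plain dicts instead of real components (all fields invented): levels must agree, and refs must line up either directly or through the ref the provider was resolved from:

    # Invented data structures for illustration; the real check reads component metadata.
    def toy_storage_ref_match(consumer, provider):
        if consumer['level'] != provider['level']:
            return False  # e.g. a sentence-level consumer cannot use word-level embeddings
        if consumer['storage_ref'] == provider['storage_ref']:
            return True
        # fall back to the ref the provider was resolved from, if any
        return consumer['storage_ref'] == provider.get('resolved_from')

    consumer = {'level': 'word', 'storage_ref': 'glove_100d'}
    provider = {'level': 'word', 'storage_ref': 'glove', 'resolved_from': 'glove_100d'}
    print(toy_storage_ref_match(consumer, provider))  # True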
Example 12
    def extract_required_features_ref_from_pipe(pipe: NLUPipeline):
        """Extract provided features from pipe, which have  storage ref"""
        provided_features_ref = []
        for c in pipe.components:
            for feat in c.info.inputs:
                if 'embed' in feat:
                    # if StorageRefUtils.extract_storage_ref(c) !='':  # special edge case, some components might not have a storage ref set
                    if '@' not in feat:
                        provided_features_ref.append(
                            feat + "@" +
                            StorageRefUtils.extract_storage_ref(c))
                    else:
                        provided_features_ref.append(feat)

        return ComponentUtils.clean_irrelevant_features(provided_features_ref)
Example 13
    def enforce_AT_schema_on_NER_processors_and_add_missing_NER_converters(
            pipe):
        """For every NER provider and consumer, enforce that their output col is named <output_level>@storage_ref for output_levels word,chunk,sentence aka document , i.e. word_embed@elmo or sentence_embed@elmo etc..
        We also add NER converters for every NER model that no Converter converting it's inputs
        In addition, returns the pipeline with missing NER converters added, for every NER model.
        The converters transform the IOB schema in a merged and more usable form for downstream tasks
        1. Find a NER model in pipe
        2. Find a NER converter feeding from it, if there is None, create one.
        3. Generate name with Identifier  <ner-iob>@<nlu_ref_identifier>  and <entities>@<nlu_ref_identifier>
        3.1 Update NER Models    output to <ner-iob>@<nlu_ref_identifier>
        3.2 Update NER Converter input  to <ner-iob>@<nlu_ref_identifier>
        3.3 Update NER Converter output to <entities>@<nlu_ref_identifier>
        4. Update every Component that feeds from the NER converter (i.e. Resolver etc..)
        """
        from nlu import Util
        new_converters = []
        for c in pipe.components:
            if ComponentUtils.is_NER_provider(c):
                output_NER_col = ComponentUtils.extract_NER_col(c, 'output')
                converter_to_update = None
                for other_c in pipe.components:
                    if output_NER_col in other_c.info.inputs and ComponentUtils.is_NER_converter(
                            other_c):
                        converter_to_update = other_c

                ner_identifier = ComponentUtils.get_nlu_ref_identifier(c)
                if converter_to_update is None:
                    if c.info.license == 'healthcare':
                        converter_to_update = Util(
                            "ner_to_chunk_converter_licensed",
                            is_licensed=True)
                    else:
                        converter_to_update = Util("ner_to_chunk_converter")
                    new_converters.append(converter_to_update)

                converter_to_update.info.nlu_ref = f'ner_converter@{ner_identifier}'

                # 3. generate new col names
                new_NER_AT_ref = output_NER_col
                if '@' not in output_NER_col:
                    new_NER_AT_ref = output_NER_col + '@' + ner_identifier
                new_NER_converter_AT_ref = 'entities' + '@' + ner_identifier

                # 3.1 update NER model outputs
                c.info.outputs = [new_NER_AT_ref]
                c.info.spark_output_column_names = [new_NER_AT_ref]
                c.model.setOutputCol(new_NER_AT_ref)

                #3.2 update converter inputs
                old_ner_input_col = ComponentUtils.extract_NER_converter_col(
                    converter_to_update, 'input')
                converter_to_update.info.inputs.remove(old_ner_input_col)
                converter_to_update.info.spark_input_column_names.remove(
                    old_ner_input_col)
                converter_to_update.info.inputs.append(new_NER_AT_ref)
                converter_to_update.info.spark_input_column_names.append(
                    new_NER_AT_ref)
                converter_to_update.model.setInputCols(
                    converter_to_update.info.inputs)

                #3.3 update converter outputs
                converter_to_update.info.outputs = [new_NER_converter_AT_ref]
                converter_to_update.info.spark_output_column_names = [
                    new_NER_converter_AT_ref
                ]
                converter_to_update.model.setOutputCol(
                    new_NER_converter_AT_ref)

                ## TODO improve: this causes the first NER producer to feed all NER consumers; all other
                ## NER producers will be ignored by NER consumers without special syntax or manual configs
                ## 4. Update all NER consumers' input columns
                for conversion_consumer in pipe.components:
                    if 'entities' in conversion_consumer.info.inputs:
                        conversion_consumer.info.inputs.remove('entities')
                        conversion_consumer.info.spark_input_column_names.remove(
                            'entities')
                        conversion_consumer.info.inputs.append(
                            new_NER_converter_AT_ref)
                        conversion_consumer.info.spark_input_column_names.append(
                            new_NER_converter_AT_ref)

        # Add new converters to pipe
        for conv in new_converters:
            if conv.info.license == 'healthcare':
                pipe.add(
                    conv,
                    name_to_add=
                    f'chunk_converter_licensed@{conv.info.outputs[0].split("@")[0]}'
                )
            else:
                pipe.add(
                    conv,
                    name_to_add=
                    f'chunk_converter@{conv.info.outputs[0].split("@")[0]}')
        return pipe
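
To make the renaming in steps 3.1-3.3 concrete, a hedged walk-through on invented column names:

    # Invented identifier; the real code obtains it via ComponentUtils.get_nlu_ref_identifier.
    ner_identifier = 'my_ner'
    new_ner_col = 'ner' + '@' + ner_identifier              # 3.1 NER model output -> 'ner@my_ner'
    converter_input = new_ner_col                           # 3.2 converter reads the IOB tags
    converter_output = 'entities' + '@' + ner_identifier    # 3.3 converter output -> 'entities@my_ner'
    print(new_ner_col, converter_input, converter_output)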
Example 14
 def is_trainable_pipe(pipe):
     '''Check if pipe is trainable'''
     for c in pipe.components:
         if ComponentUtils.is_untrained_model(c): return True
     return False
Example 15
    def check_and_fix_component_output_column_name_satisfaction(
            pipe: NLUPipeline):
        '''
        This function verifies that every input and output column name of a component is satisfied.
        Missing output names are added by this method.
        Usually classifiers need to change their input column name so that it matches one of the previous embeddings, because those have dynamic output names.
        This function performs the following steps:
        1. For each component, verify that all input column names are satisfied by checking all other components' output names
        2. When an input column is missing, do the following:
        2.1 Figure out the type of the missing input column. The name of the missing column should be equal to the type
        2.2 Check if there is already a component in the pipe which provides this input (there should be)
        2.3 When a providing component is found, check if the storage refs match up
        2.4 If true for all, update the provider component's output name, or update the original component's input name
        :return: NLU pipeline where the output and input column names of the models have been adjusted to each other
        '''
        logger.info("Fixing input and output column names")

        for component_to_check in pipe.components:
            input_columns = set(
                component_to_check.info.spark_input_column_names)
            # a component either has '' storage ref or at most 1
            logger.info(
                f'Checking for component {component_to_check.info.name} whether inputs {input_columns} are satisfied by another component in the pipe'
            )
            for other_component in pipe.components:
                if component_to_check.info.name == other_component.info.name:
                    continue
                output_columns = set(
                    other_component.info.spark_output_column_names)
                input_columns -= output_columns  # subtract already provided columns

            input_columns = ComponentUtils.clean_irrelevant_features(
                input_columns)

            # Resolve basic mismatches, usually storage refs
            if ((len(input_columns) != 0 and not pipe.has_trainable_components)
                    or ComponentUtils.is_embedding_consumer(component_to_check)):  # fix missing column name
                # We must not only check if input satisfied, but if storage refs match! and Match Storage_refs accordingly
                logger.info(
                    f"Fixing bad input col for C={component_to_check} untrainable pipe"
                )
                resolved_storage_ref_cols = []
                for missing_column in input_columns:
                    for other_component in pipe.components:
                        if component_to_check.info.name == other_component.info.name:
                            continue
                        if other_component.info.type == missing_column:
                            # We found a provider for the missing feature

                            if StorageRefUtils.has_storage_ref(
                                    other_component
                            ) and ComponentUtils.is_embedding_provider(
                                    component_to_check):
                                if ComponentUtils.are_producer_consumer_matches(
                                        component_to_check, other_component):
                                    resolved_storage_ref_cols.append(
                                        (other_component.info.
                                         spark_output_column_names[0],
                                         missing_column))

                            # Rename the provider's output to the missing column name and keep
                            # its metadata in sync with the setOutputCol call below
                            other_component.info.spark_output_column_names = [
                                missing_column
                            ]
                            other_component.info.outputs = [missing_column]
                            logger.info(
                                f'Resolved requirement for missing_column={missing_column} with inputs from provider={other_component.info.name} by col={missing_column} '
                            )
                            other_component.model.setOutputCol(missing_column)

                for resolution, unsatisfied in resolved_storage_ref_cols:
                    component_to_check.info.spark_input_column_names.remove(
                        unsatisfied)
                    component_to_check.info.spark_input_column_names.append(
                        resolution)
                component_to_check.info.inputs = component_to_check.info.spark_input_column_names

            # TODO: use is_storage_ref_match?
            # Resolve training mismatches
            elif len(input_columns) != 0 and pipe.has_trainable_components:  # fix missing column name
                logger.info(
                    f"Fixing bad input col for C={component_to_check} trainable pipe"
                )

                # for trainable components, we change their input columns and leave other components outputs unchanged
                for missing_column in input_columns:
                    for other_component in pipe.components:
                        if component_to_check.info.name == other_component.info.name:
                            continue
                        if other_component.info.type == missing_column:
                            # We update the input col name for the component that has missing cols
                            component_to_check.info.spark_input_column_names.remove(
                                missing_column)

                            component_to_check.info.spark_input_column_names.append(
                                other_component.info.
                                spark_output_column_names[0])
                            component_to_check.model.setInputCols(
                                component_to_check.info.
                                spark_input_column_names)

                            logger.info(
                                f'Setting input col columns for component {component_to_check.info.name} to {other_component.info.spark_output_column_names[0]} '
                            )

        return pipe
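
A toy version of steps 2.1-2.4, with invented names: the missing input column equals the provider's type, so the provider's output column is renamed to satisfy the consumer:

    # Plain dicts stand in for components; '- {token}' stands in for clean_irrelevant_features.
    provider = {'type': 'word_embeddings', 'outputs': ['bert_base_out']}
    consumer_inputs = {'token', 'word_embeddings'}
    missing_cols = consumer_inputs - set(provider['outputs']) - {'token'}
    for missing in missing_cols:
        if provider['type'] == missing:
            provider['outputs'] = [missing]  # mirrors other_component.model.setOutputCol(missing)
    print(provider['outputs'])  # ['word_embeddings']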
Example 16
    def satisfy_dependencies(pipe: NLUPipeline) -> NLUPipeline:
        """Dependency Resolution Algorithm.
        For a given pipeline with N components, builds a DAG in reverse and satisfiy each of their dependencies and child dependencies
         with a BFS approach and returns the resulting pipeline"""
        all_features_provided = False
        is_licensed = PipelineQueryVerifier.has_licensed_components(pipe)
        pipe.has_licensed_components = is_licensed
        while not all_features_provided:
            # After new components have been added, we must loop again and check whether the new components' requirements are met
            components_to_add = []
            missing_components, missing_storage_refs, components_for_embedding_conversion = PipelineQueryVerifier.get_missing_required_features(
                pipe)
            logger.info(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            logger.info(
                f"Trying to resolve missing features for \n missing_components={missing_components} \n missing storage_refs={missing_storage_refs}\n conversion_candidates={components_for_embedding_conversion}"
            )
            if PipelineQueryVerifier.check_if_all_dependencies_satisfied(
                    missing_components, missing_storage_refs,
                    components_for_embedding_conversion):
                break  # Now all features are provided

            # Create missing base storage ref producers, i.e embeddings
            for missing_component in missing_storage_refs:
                component = get_default_component_of_type(
                    missing_component,
                    language=pipe.lang,
                    is_licensed=is_licensed)
                if component is None: continue
                if 'chunk_emb' in missing_component:
                    components_to_add.append(
                        ComponentUtils.config_chunk_embed_converter(component))
                else:
                    components_to_add.append(component)

            # Create missing base components; storage refs were fetched in the previous loop
            for missing_component in missing_components:
                components_to_add.append(
                    get_default_component_of_type(missing_component,
                                                  language=pipe.lang,
                                                  is_licensed=is_licensed))

            # Create embedding converters
            for resolution_info in components_for_embedding_conversion:
                converter = None
                if 'word2chunk' == resolution_info.type:
                    converter = PipelineQueryVerifier.add_chunk_embedding_converter(
                        resolution_info)
                elif 'word2sentence' == resolution_info.type:
                    converter = PipelineQueryVerifier.add_sentence_embedding_converter(
                        resolution_info)
                if converter is not None: components_to_add.append(converter)

            logger.info(
                f'Resolved for missing components the following NLU components : {components_to_add}'
            )

            # Add missing components
            for new_component in components_to_add:
                logger.info(f'adding {new_component.info.name}')
                pipe.add(new_component)

        logger.info(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        )
        logger.info("All dependencies satisfied")
        return pipe
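
A hedged miniature of the resolution loop's shape, with an invented resolver table and component tuples: keep adding default providers for missing features until nothing is missing:

    # DEFAULTS maps a missing feature to a (name, inputs, outputs) component tuple;
    # the real code calls get_default_component_of_type instead.
    DEFAULTS = {'token': ('tokenizer', {'document'}, {'token'}),
                'word_embeddings': ('glove', {'token'}, {'word_embeddings'})}

    components = [('ner', {'token', 'word_embeddings'}, {'ner'})]
    while True:
        provided = {'document'} | {f for _, _, outs in components for f in outs}
        missing = {f for _, ins, _ in components for f in ins} - provided
        if not missing:
            break  # all features are provided
        for m in missing:
            if m in DEFAULTS:  # assumes every missing feature has a default provider
                components.append(DEFAULTS[m])
    print([name for name, _, _ in components])  # e.g. ['ner', 'tokenizer', 'glove']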