def __init__(self, feature_size, self_attention_layer,
                 cross_attention_layer, feed_forward_layer, adapter_dict,
                 adapter_bottleneck_size, dropout_rate):
        super(TransformerDecoderLayer, self).__init__()
        self.feature_size = feature_size

        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer

        # one bottleneck adapter and one residual sublayer connection per domain
        adapters = nn.ModuleDict({})
        sublayer_connection_for_adapter = nn.ModuleDict({})
        _adapters = collections.OrderedDict()
        _sublayer_connection_for_adapter = collections.OrderedDict()
        for domain, size in zip(adapter_dict, adapter_bottleneck_size):
            _adapters[domain] = PositionWiseFeedForward(input_dim=feature_size,
                                                        ff_dim=size,
                                                        dropout=dropout_rate)
            _sublayer_connection_for_adapter[domain] = SublayerConnection(
                feature_size, dropout_rate)
        adapters.update(_adapters)
        sublayer_connection_for_adapter.update(
            _sublayer_connection_for_adapter)

        self.adapters = adapters
        self.sublayer_connection_for_adapter = sublayer_connection_for_adapter

        self.sublayer_with_cache = clones(
            SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate)
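
The constructor above registers one `PositionWiseFeedForward` block per domain, with the domain's `adapter_bottleneck_size` as its hidden width, plus one `SublayerConnection` to wrap it. Neither class is defined on this page; the self-contained sketch below only illustrates the bottleneck-adapter pattern they presumably follow (down-projection, non-linearity, up-projection, wrapped in a pre-norm residual connection) using stand-in modules and made-up domain names.

import collections

import torch
import torch.nn as nn


class BottleneckAdapter(nn.Module):
    """Stand-in for a PositionWiseFeedForward block used as a domain adapter."""

    def __init__(self, feature_size, bottleneck_size, dropout_rate):
        super().__init__()
        self.down = nn.Linear(feature_size, bottleneck_size)   # down-projection
        self.up = nn.Linear(bottleneck_size, feature_size)     # up-projection
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        return self.up(self.dropout(torch.relu(self.down(x))))


class ResidualConnection(nn.Module):
    """Stand-in for SublayerConnection: pre-norm residual around a sublayer."""

    def __init__(self, feature_size, dropout_rate):
        super().__init__()
        self.norm = nn.LayerNorm(feature_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))


# register one adapter and one residual connection per domain, as in the code above
feature_size, dropout_rate = 512, 0.1
adapter_dict = ['news', 'medical']        # hypothetical domain names
adapter_bottleneck_size = [64, 128]       # hypothetical bottleneck widths

adapters = nn.ModuleDict(collections.OrderedDict(
    (domain, BottleneckAdapter(feature_size, size, dropout_rate))
    for domain, size in zip(adapter_dict, adapter_bottleneck_size)))
sublayer_connection_for_adapter = nn.ModuleDict(collections.OrderedDict(
    (domain, ResidualConnection(feature_size, dropout_rate))
    for domain in adapter_dict))

x = torch.randn(2, 7, feature_size)                               # (batch, seq, features)
y = sublayer_connection_for_adapter['news'](x, adapters['news'])  # one domain's adapter
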
    def __init__(self,
                 feature_size,
                 self_attention_layer,
                 cross_attention_layer,
                 feed_forward_layer,
                 adapter_dict,
                 adapter_bottleneck_size,
                 dropout_rate):
        super(TransformerDecoderLayer, self).__init__()
        self.feature_size = feature_size

        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer

        self.sublayer_with_cache = clones(SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate)

        self.domain_specific_sublayer_with_cache = None
        self.domain_specific_sublayer = None

        self.domain_specific_adapter_for_ffn = None
        self.domain_specific_adapter_for_self_attn = None
        self.domain_specific_adapter_for_cross_attn = None
        self.domain_specific_sublayer_for_self_attn_adapter = None
        self.domain_specific_sublayer_for_cross_attn_adapter = None
        self.domain_specific_sublayer_for_ffn_adapter = None
        self.init_adapter(feature_size, dropout_rate, adapter_dict, adapter_bottleneck_size)
    def __init__(self, feature_size, self_attention_layer, feed_forward_layer,
                 domain_adapter_dict, dropout_rate):
        """

        :param feature_size: the input size of each transformer encoder layer, same as output size
        :param self_attention_layer:
        :param feed_forward_layer:
        :param domain_adapter_dict:
        :param dropout_rate:
        """
        super().__init__()

        self.self_attention_layer = self_attention_layer  # sub layer 1
        self.feed_forward_layer = feed_forward_layer  # sub layer 2

        self.domain_adapter_dict = domain_adapter_dict

        # make adapter
        self.adapters = StackedAdapter(domain_adapter_dict=domain_adapter_dict,
                                       feature_size=feature_size,
                                       dropout_rate=dropout_rate)

        self.sub_layer_connections = clones(
            SublayerConnection(feature_size, dropout_rate), 2)
        self.feature_size = feature_size
Example #4
    def __init__(self, feature_size, self_attention_layer, feed_forward_layer,
                 adapter_dict, adapter_bottleneck_size, dropout_rate):
        """

        :param feature_size: the input size of each transformer encoder layer, same as output size
        :param self_attention_layer:
        :param feed_forward_layer:
        :param adapter_dict:
        :param dropout_rate:
        """
        super().__init__()

        self.self_attention_layer = self_attention_layer  # sub layer 1
        self.feed_forward_layer = feed_forward_layer  # sub layer 2

        self.adapter_dict = adapter_dict

        self.domain_specific_sublayer_connection = None
        self.domain_specific_adapter_for_ffn = None
        self.domain_specific_adapter_for_self_attn = None
        self.domain_specific_sublayer_connection_for_adapter = None
        self.init_adapter(feature_size, dropout_rate, adapter_dict,
                          adapter_bottleneck_size)

        self.sub_layer_connections = clones(
            SublayerConnection(feature_size, dropout_rate), 2)
        self.feature_size = feature_size
    def __init__(self,
                 feature_size,
                 self_attention_layer,
                 feed_forward_layer,
                 parallel_adapter_layer,
                 dropout_rate,
                 layer_norm_rescale: bool = True):
        """

        :param feature_size:
        :param self_attention_layer:
        :param feed_forward_layer:
        :param parallel_adapter_layer:
        :param dropout_rate:
        :param layer_norm_rescale:
        """
        super().__init__()

        self.feature_size = feature_size
        self.self_attention_layer = self_attention_layer  # sub layer 1
        self.feed_forward_layer = feed_forward_layer  # sub layer 2
        self.parallel_adapter = parallel_adapter_layer
        self.sub_layer_connections = clones(
            SublayerConnection(feature_size,
                               dropout_rate,
                               layer_norm_rescale=layer_norm_rescale), 2)
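
This variant receives a pre-built `parallel_adapter_layer` instead of constructing stacked adapters itself. How the parallel adapter is applied is not visible on this page; a common arrangement, sketched below purely as an assumption, feeds the adapter the same input as the sub-layer and sums the two outputs, instead of chaining the adapter after the sub-layer. Every class and value in the sketch is an illustrative stand-in.

import torch
import torch.nn as nn


class ParallelAdapterSketch(nn.Module):
    """Illustrative bottleneck adapter applied in parallel to a sub-layer."""

    def __init__(self, feature_size, bottleneck_size, dropout_rate):
        super().__init__()
        self.down = nn.Linear(feature_size, bottleneck_size)
        self.up = nn.Linear(bottleneck_size, feature_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        return self.up(self.dropout(torch.relu(self.down(x))))


def parallel_sublayer(x, sublayer, adapter):
    # stacked adapters transform the sub-layer output; a parallel adapter instead
    # reads the sub-layer input, and its output is added to the sub-layer output
    # (any residual connection and layer norm would live in SublayerConnection)
    return sublayer(x) + adapter(x)


feature_size = 512
ffn = nn.Sequential(nn.Linear(feature_size, 2048), nn.ReLU(),
                    nn.Linear(2048, feature_size))
adapter = ParallelAdapterSketch(feature_size, 64, 0.1)
out = parallel_sublayer(torch.randn(2, 7, feature_size), ffn, adapter)
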
Example #6
    def init_adapter(
        self,
        feature_size,
        dropout_rate,
        adapter_dict,
        adapter_bottleneck_size,
    ):

        # define two new sublayer connections per domain, each with a domain-specific layer norm
        domain_specific_sublayer_connection = nn.ModuleDict({})
        _domain_specific_sublayer_connection = collections.OrderedDict()
        for domain in adapter_dict:
            _domain_specific_sublayer_connection[domain] = clones(
                SublayerConnection(feature_size, dropout_rate), 2)
        domain_specific_sublayer_connection.update(
            _domain_specific_sublayer_connection)
        self.domain_specific_sublayer_connection = domain_specific_sublayer_connection

        # define one adapter per sub-layer (self-attention and feed-forward) for each domain
        domain_specific_adapter_for_self_attn = nn.ModuleDict({})
        _domain_specific_adapter_for_self_attn = collections.OrderedDict()
        domain_specific_adapter_for_ffn = nn.ModuleDict({})
        _domain_specific_adapter_for_ffn = collections.OrderedDict()
        domain_specific_sublayer_connection_for_adapter = nn.ModuleDict({})
        _domain_specific_sublayer_connection_for_adapter = collections.OrderedDict()
        for domain, domain_sz in zip(adapter_dict, adapter_bottleneck_size):
            _domain_specific_adapter_for_self_attn[
                domain] = PositionWiseFeedForward(feature_size, domain_sz,
                                                  dropout_rate)
            _domain_specific_adapter_for_ffn[domain] = PositionWiseFeedForward(
                feature_size, domain_sz, dropout_rate)
            _domain_specific_sublayer_connection_for_adapter[domain] = clones(
                SublayerConnection(feature_size, dropout_rate), 2)

        domain_specific_adapter_for_self_attn.update(
            _domain_specific_adapter_for_self_attn)
        domain_specific_adapter_for_ffn.update(
            _domain_specific_adapter_for_ffn)
        domain_specific_sublayer_connection_for_adapter.update(
            _domain_specific_sublayer_connection_for_adapter)
        self.domain_specific_adapter_for_self_attn = domain_specific_adapter_for_self_attn
        self.domain_specific_adapter_for_ffn = domain_specific_adapter_for_ffn
        self.domain_specific_sublayer_connection_for_adapter = domain_specific_sublayer_connection_for_adapter
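
`init_adapter` only registers the per-domain modules; the matching forward pass is not shown on this page. The function below is a hypothetical illustration of how those modules could be wired for a single `domain`; the `attn(q, k, v)` call convention and the ordering of the residual blocks are assumptions, not the repository's code.

def encoder_layer_forward(x, domain, self_attn, feed_forward,
                          domain_specific_sublayer_connection,
                          domain_specific_adapter_for_self_attn,
                          domain_specific_adapter_for_ffn,
                          domain_specific_sublayer_connection_for_adapter):
    # hypothetical wiring: each sub-layer runs inside the domain's own residual
    # connection (domain-specific layer norm), then the domain's adapter runs
    # inside its own residual connection
    x = domain_specific_sublayer_connection[domain][0](x, lambda h: self_attn(h, h, h))
    x = domain_specific_sublayer_connection_for_adapter[domain][0](
        x, domain_specific_adapter_for_self_attn[domain])
    x = domain_specific_sublayer_connection[domain][1](x, feed_forward)
    x = domain_specific_sublayer_connection_for_adapter[domain][1](
        x, domain_specific_adapter_for_ffn[domain])
    return x
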
Example #7
    def __init__(self,
                 adapter_type,
                 domain_adapter_dict,
                 feature_size,
                 dropout_rate,
                 domain_list: list = None,
                 domain_inner_gate_list: list = None,
                 max_domain_num: int = None):
        """
        This module implements the Mixture-of-Adapters layer.
        Given several already-trained domain adapters, if the module contains an
        inner gate, the gate provides mixing weights over the adapters, and the
        gate is optimized on both the translation and the classification objectives.

        :param adapter_type: only 'stack' is currently supported (see the early return below)
        :param domain_adapter_dict:
        :param feature_size:
        :param dropout_rate:
        :param domain_list:
        :param domain_inner_gate_list:
        :param max_domain_num:
        """
        super(MixtureOfAdapterWithClassifier, self).__init__()

        # todo: add parallel adapter support; only the 'stack' type is handled for now
        if adapter_type != 'stack':
            return

        # per-domain adapter layers and their residual sublayer connections
        adapter_layers = nn.ModuleDict({})
        sublayer_connection_for_adapter = nn.ModuleDict({})
        for domain in domain_adapter_dict.keys():
            adapter_layers[domain] = FeedForwardAdapterLayer(input_dim=feature_size,
                                                             ff_dim=domain_adapter_dict[domain]['memory_count'],
                                                             dropout=dropout_rate)
            sublayer_connection_for_adapter[domain] = SublayerConnection(size=feature_size,
                                                                         dropout=dropout_rate)
        self.adapter_layers = adapter_layers
        self.sublayer_connection_for_adapter = sublayer_connection_for_adapter

        # mixture-of-experts bookkeeping: map each domain name to an index
        self.domain_list = domain_list
        self.domain_dict = {}
        for i, domain in enumerate(self.domain_list):
            self.domain_dict[domain] = i

        # per-domain inner gate that predicts mixing weights over up to max_domain_num domains
        self.max_domain_num = max_domain_num
        self.domain_inner_gate_list = domain_inner_gate_list
        inner_gate = nn.ModuleDict({})
        for domain in domain_inner_gate_list:
            inner_gate[domain] = nn.Sequential(
                nn.Linear(in_features=feature_size, out_features=max_domain_num),
                nn.ReLU(),
                nn.Linear(in_features=max_domain_num, out_features=max_domain_num))
        self.inner_gate = inner_gate
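
The docstring says the inner gate provides mixing weights over already-trained domain adapters, but the mixing itself is not part of this constructor. The self-contained sketch below shows one way such a gate can combine adapter outputs: a softmax over the gate logits weights a stack of per-domain adapter outputs. The shapes, domain names, and the softmax choice are assumptions made only for illustration.

import torch
import torch.nn as nn

feature_size, max_domain_num = 512, 4
domain_list = ['news', 'medical', 'it', 'law']       # hypothetical domains

# one small bottleneck adapter per domain, standing in for adapter_layers above
adapters = nn.ModuleDict({
    d: nn.Sequential(nn.Linear(feature_size, 64), nn.ReLU(),
                     nn.Linear(64, feature_size))
    for d in domain_list})

# inner gate: token representation -> logits over (at most) max_domain_num domains
gate = nn.Sequential(nn.Linear(feature_size, max_domain_num), nn.ReLU(),
                     nn.Linear(max_domain_num, max_domain_num))

x = torch.randn(2, 7, feature_size)
mix_weights = torch.softmax(gate(x), dim=-1)                              # (2, 7, 4)
adapter_out = torch.stack([adapters[d](x) for d in domain_list], dim=-1)  # (2, 7, 512, 4)
mixed = (adapter_out * mix_weights.unsqueeze(-2)).sum(dim=-1)             # (2, 7, 512)
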
    def __init__(self,
                 feature_size,
                 self_attention_layer,
                 cross_attention_layer,
                 feed_forward_layer,
                 dropout_rate,
                 adapter_setting,
                 domain_adapter_dict: dict = None,
                 domain_list: list = None,
                 max_domain_num: int = 0,
                 domain_inner_gate_list: list = None):

        super(TransformerDecoderLayerWithAdapter, self).__init__()

        self.feature_size = feature_size
        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer
        self.sublayer_with_cache = clones(
            SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate)

        # make adapter
        self.domain_adapter_dict = domain_adapter_dict
        self.adapter_type = adapter_setting['type']
        self.adapter_fusion = adapter_setting['fusion']
        self.domain_list = domain_list
        self.max_domain_num = max_domain_num

        if self.adapter_fusion == 'mix':
            self.adapters = MixtureOfAdapter(
                adapter_type=self.adapter_type,
                domain_adapter_dict=domain_adapter_dict,
                feature_size=feature_size,
                dropout_rate=dropout_rate,
                domain_list=domain_list,
                domain_inner_gate_list=domain_inner_gate_list,
                max_domain_num=max_domain_num)

        else:
            if self.adapter_type == 'stack':

                self.adapters = StackedAdapter(
                    domain_adapter_dict=domain_adapter_dict,
                    feature_size=feature_size,
                    dropout_rate=dropout_rate)

            elif self.adapter_type == 'parallel':
                self.adapters = ParallelAdapter(
                    domain_adapter_dict=domain_adapter_dict,
                    feature_size=feature_size,
                    dropout_rate=dropout_rate,
                    max_domain_num=adapter_setting['max_domain_num'],
                    domain_idx_dict=adapter_setting['domain_idx_dict'],
                )
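
The constructor branches on `adapter_setting['fusion']` and `adapter_setting['type']`. Only part of the `adapter_setting` schema is visible here; the dictionaries below are hypothetical values consistent with the keys read above ('type', 'fusion', 'max_domain_num', 'domain_idx_dict'), shown only to make the branching concrete.

# 'fusion' == 'mix' -> MixtureOfAdapter; otherwise 'type' selects the adapter class
adapter_setting_stack = {'type': 'stack', 'fusion': 'none'}       # -> StackedAdapter

adapter_setting_parallel = {                                      # -> ParallelAdapter
    'type': 'parallel',
    'fusion': 'none',
    'max_domain_num': 4,                                          # hypothetical value
    'domain_idx_dict': {'news': 0, 'medical': 1},                 # hypothetical value
}

adapter_setting_mix = {'type': 'stack', 'fusion': 'mix'}          # -> MixtureOfAdapter
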
Example #9
    def __init__(self, feature_size, self_attention_layer,
                 cross_attention_layer, feed_forward_layer,
                 domain_adapter_dict, dropout_rate):
        super(TransformerDecoderLayerWithClassifierAdapter, self).__init__()
        self.feature_size = feature_size

        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer

        self.adapter = StackedAdapter(domain_adapter_dict, feature_size,
                                      dropout_rate)

        self.sublayer_with_cache = clones(
            SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate)
    def __init__(
        self,
        feature_size,
        self_attention_layer,
        feed_forward_layer,
        dropout_rate,
        adapter_setting,
        domain_adapter_dict: dict = None,
        domain_list: list = None,
        max_domain_num: int = 0,
        domain_inner_gate_list: list = None,
    ):
        """

        :param feature_size: the input size of each transformer encoder layer, same as output size
        :param self_attention_layer:
        :param feed_forward_layer:
        :param domain_adapter_dict:
        :param dropout_rate:
        """
        super().__init__()

        self.self_attention_layer = self_attention_layer  # sub layer 1
        self.feed_forward_layer = feed_forward_layer  # sub layer 2
        self.sub_layer_connections = clones(
            SublayerConnection(feature_size, dropout_rate), 2)
        self.feature_size = feature_size

        # make adapter
        self.domain_adapter_dict = domain_adapter_dict
        self.adapter_type = adapter_setting['type']
        self.adapter_fusion = adapter_setting['fusion']
        self.domain_list = domain_list
        self.max_domain_num = max_domain_num

        self.adapters = MixtureOfAdapterWithClassifier(
            adapter_type=self.adapter_type,
            domain_adapter_dict=domain_adapter_dict,
            feature_size=feature_size,
            dropout_rate=dropout_rate,
            domain_list=domain_list,
            domain_inner_gate_list=domain_inner_gate_list,
            max_domain_num=max_domain_num)
Example #11
    def __init__(self,
                 feature_size: int,
                 self_attention_layer,
                 cross_attention_layer,
                 feed_forward_layer,
                 dropout_rate: float,
                 layer_norm_rescale: bool = True):

        super(TransformerDecoderLayer, self).__init__()
        self.feature_size = feature_size
        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer

        self.sublayer_with_cache = clones(
            SublayerConnectionWithCache(feature_size, dropout_rate,
                                        layer_norm_rescale), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate,
                                           layer_norm_rescale)
Example #12
    def __init__(
        self,
        domain_adapter_dict,
        feature_size,
        dropout_rate,
    ):
        super(StackedAdapter, self).__init__()

        adapter_layers = nn.ModuleDict({})
        sublayer_connection_for_adapter = nn.ModuleDict({})

        for domain in domain_adapter_dict.keys():
            adapter_layers[domain] = FeedForwardAdapterLayer(
                input_dim=feature_size,
                ff_dim=domain_adapter_dict[domain]['memory_count'],
                dropout=dropout_rate)
            sublayer_connection_for_adapter[domain] = SublayerConnection(
                size=feature_size, dropout=dropout_rate)

        self.adapter_layers = adapter_layers
        self.sublayer_connection_for_adapter = sublayer_connection_for_adapter
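
`StackedAdapter` reads a single `memory_count` per domain, the inner width of that domain's feed-forward adapter. `FeedForwardAdapterLayer` and `SublayerConnection` are not shown on this page, so the sketch below pairs a hypothetical configuration with stand-in modules and an assumed chained ("stacked") application.

import torch
import torch.nn as nn

# hypothetical per-domain configuration: 'memory_count' is the adapter's inner width
domain_adapter_dict = {
    'news':    {'memory_count': 1024},
    'medical': {'memory_count': 2048},
}
feature_size, dropout_rate = 512, 0.1

# stand-ins for FeedForwardAdapterLayer / SublayerConnection
adapter_layers = nn.ModuleDict({
    d: nn.Sequential(nn.Linear(feature_size, cfg['memory_count']), nn.ReLU(),
                     nn.Dropout(dropout_rate),
                     nn.Linear(cfg['memory_count'], feature_size))
    for d, cfg in domain_adapter_dict.items()})
adapter_norms = nn.ModuleDict({d: nn.LayerNorm(feature_size)
                               for d in domain_adapter_dict})


def stacked_forward(x, used_domains):
    # assumed behaviour: apply each requested domain adapter in turn,
    # each wrapped in its own pre-norm residual connection
    for d in used_domains:
        x = x + adapter_layers[d](adapter_norms[d](x))
    return x


out = stacked_forward(torch.randn(2, 7, feature_size), ['news', 'medical'])
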
    def __init__(
        self,
        feature_size,
        self_attention_layer,
        cross_attention_layer,
        feed_forward_layer,
        adapters,
        dropout_rate,
    ):
        super(TransformerDecoderLayerWithMixAdapter, self).__init__()

        self.feature_size = feature_size
        self.self_attention_layer = self_attention_layer
        self.cross_attention_layer = cross_attention_layer
        self.feed_forward_layer = feed_forward_layer
        self.sublayer_with_cache = clones(
            SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        self.sublayer = SublayerConnection(feature_size, dropout_rate)

        # make adapter
        self.adapters = adapters
Example #14
    def __init__(self, domain_adapter_dict, feature_size, dropout_rate):
        super(StackedMultiHeadAdapter, self).__init__()

        adapter_layers = nn.ModuleDict({})
        _adapter_layers = collections.OrderedDict()
        sublayer_connection_for_adapter = nn.ModuleDict({})
        _sublayer_connection_for_adapter = collections.OrderedDict()

        for domain in domain_adapter_dict.keys():
            _adapter_layers[domain] = MultiHeadFeedForwardAdapterLayer(
                input_dim=feature_size,
                memory_count=domain_adapter_dict[domain]['memory_count'],
                head_num=domain_adapter_dict[domain]['head_num'],
                dropout=dropout_rate)
            _sublayer_connection_for_adapter[domain] = SublayerConnection(
                size=feature_size, dropout=dropout_rate)

        adapter_layers.update(_adapter_layers)
        sublayer_connection_for_adapter.update(
            _sublayer_connection_for_adapter)

        self.adapter_layers = adapter_layers
        self.sublayer_connection_for_adapter = sublayer_connection_for_adapter
    def __init__(self,
                 domain_adapter_dict,
                 feature_size,
                 dropout_rate,
                 domain_list: list = None,
                 # domain_inner_gate_list: list = None,
                 # gate_activate_func='sigmoid',
                 # stack_between_adapter_and_experts=False,
                 # domain_classifier_dict=None
                 ):
        """
        This module implements the Mixture-of-Adapters layer.

        :param domain_adapter_dict:
        :param feature_size:
        :param dropout_rate:
        :param domain_list:
        """
        super(MixtureOfAdapter, self).__init__()

        # initialize all adapter-based modules
        self.adapter_types = {}
        adapter_layers = nn.ModuleDict({})
        domain_mix_layers = nn.ModuleDict({})
        sublayer_connection_for_adapter = nn.ModuleDict({})

        for domain in domain_adapter_dict.keys():

            if domain_adapter_dict[domain].get('adapter_type', None) == 'memory':

                adapter_layers[domain] = MemoryAdapterLayer(input_dim=feature_size,
                                                            ff_dim=domain_adapter_dict[domain]['memory_count'],
                                                            dropout=dropout_rate)

            elif domain_adapter_dict[domain].get('adapter_type', None) == 'domain_mix':

                domain_mix_layers[domain] = AdapterMixLayer(used_adapters=domain_adapter_dict[domain]['used_adapters'],
                                                            feature_size=feature_size,
                                                            dropout_rate=dropout_rate,
                                                            classifier_dict=domain_adapter_dict[domain]['classifier_dict'])

            elif domain_adapter_dict[domain].get('is_generate', False):
                adapter_layers[domain] = ParameterGeneratorForAdapter(adapter_dict=domain_adapter_dict,
                                                                      used_adapters=domain_adapter_dict[domain]['used_adapters'],
                                                                      generate_dim=domain_adapter_dict[domain]['generate_dim'],
                                                                      feature_size=feature_size,
                                                                      bottleneck_dim=domain_adapter_dict[domain]['bottle_neck_dim'],
                                                                      dropout_rate=dropout_rate,
                                                                      linear_transform=domain_adapter_dict[domain].get('linear_transform', False))
                self.adapter_types[domain] = 'generate'
                sublayer_connection_for_adapter[domain] = SublayerConnection(size=feature_size,
                                                                             dropout=dropout_rate)

            else:
                adapter_layers[domain] = FeedForwardAdapterLayer(input_dim=feature_size,
                                                                 ff_dim=domain_adapter_dict[domain]['memory_count'],
                                                                 dropout=dropout_rate,
                                                                 activation_statistic=domain_adapter_dict[domain].get('activation_statistic', False))
                self.adapter_types[domain] = 'simple'

                sublayer_connection_for_adapter[domain] = SublayerConnection(size=feature_size,
                                                                             dropout=dropout_rate)

        self.adapter_layers = adapter_layers
        self.sublayer_connection_for_adapter = sublayer_connection_for_adapter
        self.domain_mix_layers = domain_mix_layers
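
The constructor above dispatches on per-domain keys of `domain_adapter_dict`. Its schema can only be partly inferred from the keys read in the code; the dictionary below is a hypothetical example that exercises each branch ('memory', 'domain_mix', the 'is_generate' parameter generator, and the default feed-forward adapter), with every value invented for illustration.

# hypothetical configuration exercising each branch of MixtureOfAdapter.__init__
domain_adapter_dict = {
    # default branch: plain FeedForwardAdapterLayer
    'news': {'memory_count': 1024},
    # 'memory' branch: MemoryAdapterLayer
    'medical': {'adapter_type': 'memory', 'memory_count': 2048},
    # 'domain_mix' branch: AdapterMixLayer over previously trained adapters
    'mixed': {'adapter_type': 'domain_mix',
              'used_adapters': ['news', 'medical'],
              'classifier_dict': {}},                 # sub-keys not visible here
    # 'is_generate' branch: ParameterGeneratorForAdapter
    'generated': {'is_generate': True,
                  'used_adapters': ['news', 'medical'],
                  'generate_dim': 64,
                  'bottle_neck_dim': 128,
                  'linear_transform': False},
}
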
    def init_adapter(self, feature_size, dropout_rate, adapter_dict, adapter_bottleneck_size):

        # per-domain cached sublayer connections for the two attention sub-layers,
        # each with a domain-specific layer norm
        domain_specific_sublayer_with_cache = nn.ModuleDict({})
        _domain_specific_sublayer_with_cache = collections.OrderedDict()
        for domain in adapter_dict:
            _domain_specific_sublayer_with_cache[domain] = clones(
                SublayerConnectionWithCache(feature_size, dropout_rate), 2)
        domain_specific_sublayer_with_cache.update(_domain_specific_sublayer_with_cache)
        self.domain_specific_sublayer_with_cache = domain_specific_sublayer_with_cache

        # per-domain sublayer connection for the feed-forward sub-layer
        domain_specific_sublayer = nn.ModuleDict({})
        _domain_specific_sublayer = collections.OrderedDict()
        for domain in adapter_dict:
            _domain_specific_sublayer[domain] = SublayerConnection(feature_size, dropout_rate)
        domain_specific_sublayer.update(_domain_specific_sublayer)
        self.domain_specific_sublayer = domain_specific_sublayer

        # define three adapters per domain, one for each sub-layer (self-attention, cross-attention, feed-forward)

        domain_specific_adapter_for_self_attn = nn.ModuleDict({})
        _domain_specific_adapter_for_self_attn = collections.OrderedDict()
        domain_specific_adapter_for_cross_attn = nn.ModuleDict({})
        _domain_specific_adapter_for_cross_attn = collections.OrderedDict()
        domain_specific_adapter_for_ffn = nn.ModuleDict({})
        _domain_specific_adapter_for_ffn = collections.OrderedDict()

        domain_specific_sublayer_for_self_attn_adapter = nn.ModuleDict({})
        domain_specific_sublayer_for_cross_attn_adapter = nn.ModuleDict({})
        domain_specific_sublayer_for_ffn_adapter = nn.ModuleDict({})
        _domain_specific_sublayer_for_self_attn_adapter = collections.OrderedDict()
        _domain_specific_sublayer_for_cross_attn_adapter = collections.OrderedDict()
        _domain_specific_sublayer_for_ffn_adapter = collections.OrderedDict()

        for domain, domain_sz in zip(adapter_dict, adapter_bottleneck_size):
            _domain_specific_adapter_for_self_attn[domain] = PositionWiseFeedForward(feature_size,
                                                                                     domain_sz,
                                                                                     dropout_rate)
            _domain_specific_adapter_for_cross_attn[domain] = PositionWiseFeedForward(feature_size,
                                                                                      domain_sz,
                                                                                      dropout_rate)
            _domain_specific_adapter_for_ffn[domain] = PositionWiseFeedForward(feature_size,
                                                                               domain_sz,
                                                                               dropout_rate)

            _domain_specific_sublayer_for_self_attn_adapter[domain] = SublayerConnectionWithCache(feature_size, dropout_rate)
            _domain_specific_sublayer_for_cross_attn_adapter[domain] = SublayerConnectionWithCache(feature_size, dropout_rate)
            _domain_specific_sublayer_for_ffn_adapter[domain] = SublayerConnection(feature_size, dropout_rate)

        domain_specific_adapter_for_self_attn.update(_domain_specific_adapter_for_self_attn)
        domain_specific_adapter_for_cross_attn.update(_domain_specific_adapter_for_cross_attn)
        domain_specific_adapter_for_ffn.update(_domain_specific_adapter_for_ffn)
        domain_specific_sublayer_for_self_attn_adapter.update(_domain_specific_sublayer_for_self_attn_adapter)
        domain_specific_sublayer_for_cross_attn_adapter.update(_domain_specific_sublayer_for_cross_attn_adapter)
        domain_specific_sublayer_for_ffn_adapter.update(_domain_specific_sublayer_for_ffn_adapter)

        self.domain_specific_adapter_for_self_attn = domain_specific_adapter_for_self_attn
        self.domain_specific_adapter_for_ffn = domain_specific_adapter_for_ffn
        self.domain_specific_adapter_for_cross_attn = domain_specific_adapter_for_cross_attn
        self.domain_specific_sublayer_for_self_attn_adapter = domain_specific_sublayer_for_self_attn_adapter
        self.domain_specific_sublayer_for_cross_attn_adapter = domain_specific_sublayer_for_cross_attn_adapter
        self.domain_specific_sublayer_for_ffn_adapter = domain_specific_sublayer_for_ffn_adapter
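
As with the encoder version, `init_adapter` only registers the modules; the decoder layer's forward pass is not shown on this page. The function below is a hypothetical sketch of how the registered modules could be applied for a single `domain`, assuming each cached sublayer connection returns an (output, cache) pair and an `attn(q, k, v)` call convention.

def decoder_layer_forward(x, memory, domain, self_attn, cross_attn, feed_forward,
                          sublayer_with_cache, sublayer,
                          adapters_self_attn, adapters_cross_attn, adapters_ffn,
                          adapter_sublayers_self_attn, adapter_sublayers_cross_attn,
                          adapter_sublayers_ffn):
    # hypothetical wiring consistent with the modules registered by init_adapter:
    # shared (cached) sub-layer, then that domain's adapter in its own residual
    # connection, for self-attention, cross-attention, and the feed-forward block
    x, _ = sublayer_with_cache[0](x, lambda h: self_attn(h, h, h))
    x, _ = adapter_sublayers_self_attn[domain](x, adapters_self_attn[domain])
    x, _ = sublayer_with_cache[1](x, lambda h: cross_attn(h, memory, memory))
    x, _ = adapter_sublayers_cross_attn[domain](x, adapters_cross_attn[domain])
    x = sublayer(x, feed_forward)
    x = adapter_sublayers_ffn[domain](x, adapters_ffn[domain])
    return x
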