Example #1
    def __init__(self, config: Munch = None):
        r""" Init a new GPT2 synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(GPT2LMSynapse, self).__init__(config=config)
        if config is None:
            config = GPT2LMSynapse.build_config()

        # Build hugging face config.
        huggingface_config = GPT2Config(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_layer=config.synapse.n_layer,
            n_head=config.synapse.n_head,
            n_inner=config.synapse.n_inner,
            activation_function=config.synapse.activation_function,
            resid_pdrop=config.synapse.resid_pdrop,
            embd_pdrop=config.synapse.embd_pdrop,
            attn_pdrop=config.synapse.attn_pdrop,
            layer_norm_epsilon=config.synapse.layer_norm_epsilon,
            initializer_range=config.synapse.initializer_range,
            summary_type=config.synapse.summary_type,
            summary_use_proj=config.synapse.summary_use_proj,
            summary_activation=config.synapse.summary_activation,
            summary_proj_to_labels=config.synapse.summary_proj_to_labels,
            summary_first_dropout=config.synapse.summary_first_dropout,
        )

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = GPT2Model(huggingface_config)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.pooler = GPT2Pooler(huggingface_config)

        # router: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function: causal LM (next-token) cross-entropy loss.
        # predicted: [batch_size, sequence_len, bittensor.__vocab_size__], targets: [batch_size, sequence_len] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)
Example #2
    def __init__( self, config: Munch = None):
        r""" Init a new DPN synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munch namespace config item.
        """
        super(DPNSynapse, self).__init__(config = config)
        if config is None:
            config = DPNSynapse.build_config()
        
        in_planes, out_planes = config.synapse.in_planes, config.synapse.out_planes
        num_blocks, dense_depth = config.synapse.num_blocks, config.synapse.dense_depth

        # Transform Network
        """ Transform network.
                Layers take in image inputs, normalize them, and apply
                four convolutional layers.
            Image encoder: transforms PIL-encoded tensors to a common shape.
            [batch_size, channels, rows, cols] -> [batch_size, -1, -1, -1] 

            Output: [batch_size, self.transform_dim (9728)]
        """
        self.transform = Normalize((0.1307,), (0.3081,), device=self.device)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((32, 32))
        self.transform_conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.transform_bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        self.transform_layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.transform_layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.transform_layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=1)
        self.transform_layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        self.transform_dim = (out_planes[3] * 4)+(((num_blocks[3]+1) * 4)*dense_depth[3])
        
        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim = self.transform_dim)

        # Context layers.
        """
            Distillation model for remote context. This layer takes input 
            coming from transform layer, and runs it through 3 linear layers,
            projecting it to bittensor.__network_dim__.  
        """
        self.context_layer1 = nn.Linear(self.transform_dim, 512)
        self.context_layer2 = nn.Linear(512, 256)
        self.context_layer3 = nn.Linear(256, bittensor.__network_dim__)

        # hidden layer.
        self.hidden_layer1 = nn.Linear(self.transform_dim + bittensor.__network_dim__, 512)
        self.hidden_layer2 = nn.Linear(512, 256)
        self.hidden_layer3 = nn.Linear(256, bittensor.__network_dim__)

        # Layers to project target down to target size passed by config
        # (number of classes)
        self.target_layer1 = nn.Linear(bittensor.__network_dim__, 128)
        self.target_layer2 = nn.Linear(128, self.config.synapse.target_dim)

        self.to(self.device)
Example #3
 def add_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--synapse.target_dim',
         default=10,
         type=int,
         help='Final logit layer dimension. i.e. 10 for MNIST.')
     parser = PKMRouter.add_args(parser)
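
For reference, the dotted --synapse.target_dim flag above parses with plain argparse; the only wrinkle is that the resulting attribute name contains a dot, so it has to be read back with getattr (bittensor's Config helper, which is not shown in this example, presumably folds such keys into a nested namespace). A minimal stand-alone sketch using only the standard library:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--synapse.target_dim',
    default=10,
    type=int,
    help='Final logit layer dimension. i.e. 10 for MNIST.')

# Dotted flags end up as attributes whose names contain a dot, so they are
# read back with getattr rather than normal attribute access.
args = parser.parse_args(['--synapse.target_dim', '16'])
print(getattr(args, 'synapse.target_dim'))  # 16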
Example #4
    def __init__(self, config: Munch = None, **kwargs):
        r""" Init a new ffnn synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munch namespace config item.
        """
        super(FFNNSynapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = FFNNSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        FFNNSynapse.check_config(config)
        self.config = config

        # transform_layer: transforms images to common dimension.
        # [batch_size, -1, -1, -1] -> [batch_size, self.transform_dim]
        self.transform = Normalize((0.1307, ), (0.3081, ), device=self.device)
        self.transform_pool = nn.AdaptiveAvgPool2d((28, 28))
        self.transform_conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.transform_conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.transform_drop = nn.Dropout2d()
        self.transform_dim = 320

        # context_layer: distills the remote_context from the transform layer.
        # [batch_size, transform_dim] -> [batch_size, bittensor.__network_dim__]
        self.context_layer1 = nn.Linear(self.transform_dim, 256)
        self.context_layer2 = nn.Linear(256, bittensor.__network_dim__)

        # hidden_layer: learns hidden units for network and target.
        # [batch_size, transform_dim + bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.hidden_layer1 = nn.Linear(
            self.transform_dim + bittensor.__network_dim__,
            bittensor.__network_dim__)
        self.hidden_layer2 = nn.Linear(bittensor.__network_dim__,
                                       bittensor.__network_dim__)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # target_layer: Maps from hidden layer to target dimension
        # [batch_size, bittensor.__network_dim__] -> [batch_size, self.target_dim]
        self.target_layer1 = nn.Linear(bittensor.__network_dim__, 256)
        self.target_layer2 = nn.Linear(256, self.config.synapse.target_dim)

        self.to(self.device)
Example #5
    def __init__(self, config: Munch = None, **kwargs):
        """ Initialize a new XLM synapse module.

        Args:
            config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(XLMSynapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = XLMSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        XLMSynapse.check_config(config)
        self.config = config

        # Build config.
        xlm_config = XLMConfig(
            vocab_size=bittensor.__vocab_size__,
            emb_dim=bittensor.__network_dim__,
            n_layers=config.synapse.n_layers,
            n_heads=config.synapse.n_heads,
            # More needed
        )

        # model layer: encodes tokenized sequences to network dim.
        self.xlm = XLMModel(xlm_config)

        # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = XLMPooler(xlm_config)

        # router: (PKM layer) queries network using embeddings as context
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden layer: transforms context and encoding to network dimension hidden units.
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target layer: maps from hidden layer to vocab dimension for each token.
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)
Example #6
    def __init__(self, config: Munch = None, **kwargs):
        r""" Init a new base-bert synapse.

            Args:
                config (:obj:`munch.Munch`, `required`): 
        """
        super(BertSynapseBase, self).__init__(config=config, **kwargs)
        if config is None:
            config = BertSynapseBase.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        BertSynapseBase.check_config(config)
        self.config = config

        # Hugging face config item.
        huggingface_config = BertConfig(
            vocab_size=bittensor.__vocab_size__,
            hidden_size=bittensor.__network_dim__,
            num_hidden_layers=config.synapse.num_hidden_layers,
            num_attention_heads=config.synapse.num_attention_heads,
            intermediate_size=bittensor.__network_dim__,
            is_decoder=False)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = BertModel(huggingface_config,
                                     add_pooling_layer=True)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = torch.nn.Linear(bittensor.__network_dim__,
                                            bittensor.__network_dim__)

        # pooling_layer: transforms the hidden layer into a pooled representation by taking the encoding of the first token
        # [batch_size, sequence_dim,  bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.pooler = BertPooler(huggingface_config)

        self.to(self.device)
Example #7
 def add_args(parser: argparse.ArgumentParser):
     r""" Add custom params to the parser.
     """
     parser.add_argument(
         '--synapse.num_hidden_layers',
         default=2,
         type=int,
         help='Number of hidden layers in the Transformer encoder.')
     parser.add_argument(
         '--synapse.num_attention_heads',
         default=2,
         type=int,
         help=
         'Number of attention heads for each attention layer in the Transformer encoder.'
     )
     parser.add_argument(
         '--synapse.n_block_filter',
         default=100,
         type=int,
         help='Stale neurons are filtered after this many blocks.')
     PKMRouter.add_args(parser)
Example #8
    def add_args(parser: argparse.ArgumentParser):
        """ Add model params
        """
        parser.add_argument(
            '--synapse.n_head',
            default=32,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )

        parser.add_argument(
            '--synapse.n_layer',
            default=12,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')

        parser.add_argument(
            '--synapse.block_size',
            default=20,
            type=int,
            help='Maximum sequence length (block size) the model attends over.')

        parser.add_argument('--synapse.embd_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT embedding dropout probability.')

        parser.add_argument('--synapse.resid_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT residual dropout probability.')

        parser.add_argument('--synapse.attn_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT attention dropout probability.')

        PKMRouter.add_args(parser)
Example #9
    def add_args(parser: argparse.ArgumentParser):
        r""" This function adds the configuration items for the DPN synapse.
        These args are used to instantiate a Dual Path model.
        Instantiating a configuration with the defaults will yield a "shallow" DPN-26 configuration. 

        For deeper network configurations, it is possible to set the num_blocks parameter to (3, 4, 20, 3) for a
        DPN-92. 
        
        For DPN-98 set the following:
            in_planes: (160, 320, 640, 1280)
            out_planes: (256, 512, 1024, 2048)
            num_blocks: (3, 6, 20, 3)
            dense_depth: (16, 32, 32, 128)
        """
        def to_list(arg):
            return [int(i) for i in arg.split(",")]
        parser.add_argument('--synapse.in_planes', default='160, 320, 640, 1280', action="append", type=to_list)
        parser.add_argument('--synapse.out_planes', default='256, 512, 1024, 2048', action="append", type=to_list)
        parser.add_argument('--synapse.num_blocks', default='3, 6, 20, 3', action="append", type=to_list)
        parser.add_argument('--synapse.dense_depth', default='16, 32, 32, 128', action="append", type=to_list)
        parser.add_argument('--synapse.target_dim', default=10, type=int, help='Final logit layer dimension. i.e. 10 for CIFAR-10.')
        parser = PKMRouter.add_args(parser)
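
The comma-separated defaults above go through the to_list helper, so the parsed config ends up holding lists of ints. A small stand-alone sketch of that conversion (the action="append" used above is dropped here for simplicity), including the DPN-92 override suggested in the docstring:

import argparse

def to_list(arg):
    return [int(i) for i in arg.split(",")]

parser = argparse.ArgumentParser()
# A string default is itself passed through `type`, so it also arrives as a list.
parser.add_argument('--synapse.num_blocks', default='3, 6, 20, 3', type=to_list)

print(getattr(parser.parse_args([]), 'synapse.num_blocks'))          # [3, 6, 20, 3]
print(getattr(parser.parse_args(['--synapse.num_blocks', '3, 4, 20, 3']),
              'synapse.num_blocks'))                                  # [3, 4, 20, 3]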
Example #10
 def check_config(config: Munch):
     assert config.synapse.n_layers > 0, "Number of hidden layers in the Transformer encoder must be > 0"
     assert config.synapse.n_heads > 0, "Number of attention heads for each attention layer in the Transformer encoder must be > 0"
     config = PKMRouter.check_config(config)
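
check_config only needs attribute access on a nested namespace, so it can be exercised directly with a hand-built Munch. A minimal sketch (the PKMRouter check is omitted, and the field names follow the snippet above):

from munch import Munch

def check_config(config: Munch):
    assert config.synapse.n_layers > 0, "Number of hidden layers in the Transformer encoder must be > 0"
    assert config.synapse.n_heads > 0, "Number of attention heads for each attention layer in the Transformer encoder must be > 0"

config = Munch(synapse=Munch(n_layers=2, n_heads=2))
check_config(config)          # passes silently

config.synapse.n_heads = 0
# check_config(config)        # would now raise an AssertionError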
Example #11
class GPT2Synapse(bittensor.synapse.Synapse):
    def __init__(self, config: Munch = None, **kwargs):
        r"""The full GPT language model, with a context window of block_size tokens.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munched config class.
        """
        super(GPT2Synapse, self).__init__(config=config, **kwargs)

        if config is None:
            config = GPT2Synapse.default_config()

        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        GPT2Synapse.check_config(config)
        self.config = config

        gpt_config = GPTConfig(vocab_size=bittensor.__vocab_size__,
                               n_embd=bittensor.__network_dim__,
                               n_head=config.synapse.n_head,
                               n_layer=config.synapse.n_layer,
                               block_size=config.synapse.block_size,
                               embd_pdrop=config.synapse.embd_pdrop,
                               resid_pdrop=config.synapse.resid_pdrop,
                               attn_pdrop=config.synapse.attn_pdrop)
        # Token embedding layer.
        # [bittensor.__vocab_size__, bittensor.__network_dim__]
        self.tok_emb = nn.Embedding(gpt_config.vocab_size, gpt_config.n_embd)

        # Positional embedding.
        # [1, block_size, bittensor.__network_dim__]
        self.pos_emb = nn.Parameter(
            torch.zeros(1, gpt_config.block_size, gpt_config.n_embd))
        self.drop = nn.Dropout(gpt_config.embd_pdrop)

        # Transformer blocks
        self.blocks = nn.Sequential(
            *[Block(gpt_config) for _ in range(gpt_config.n_layer)])

        # Decoder head
        self.ln_f = nn.LayerNorm(gpt_config.n_embd)

        # Head
        # [ bittensor.__network_dim__, bittensor.__network_dim__ ]
        self.head = nn.Linear(gpt_config.n_embd,
                              bittensor.__network_dim__,
                              bias=False)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = GPTPooler(gpt_config)

        # Router: (PKM layer) queries network using pooled embeddings as context.
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # Hidden layer
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # Target layer
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      gpt_config.vocab_size,
                                      bias=False)

        # Block size here corresponds to sequence lengths
        self.block_size = gpt_config.block_size
        self.apply(self._init_weights)

        # Loss function: MLM cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.num_parameters = sum(p.numel() for p in self.parameters())
        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        GPT2Synapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        """ Add model params
        """
        parser.add_argument(
            '--synapse.n_head',
            default=32,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )

        parser.add_argument(
            '--synapse.n_layer',
            default=12,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')

        parser.add_argument(
            '--synapse.block_size',
            default=20,
            type=int,
            help='Maximum sequence length (block size) the model attends over.')

        parser.add_argument('--synapse.embd_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT embedding dropout probability.')

        parser.add_argument('--synapse.resid_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT residual dropout probability.')

        parser.add_argument('--synapse.attn_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT attention dropout probability.')

        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        pass

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)

            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the CLM GPT Synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of tokenized sentences.
            
            Returns:
                hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): 
                    Hidden layer representation produced using the local_context.
        """
        # Truncate the sequence length of incoming inputs if they are too long.
        initial_length = inputs.size(1)
        if initial_length > self.block_size:
            inputs = inputs[:, -self.block_size:]
        hidden = self.local_forward(inputs=inputs.to(self.device),
                                    training=False).local_hidden

        # Now pad the output tensor back to the original length
        if initial_length > self.block_size:
            diff = initial_length - self.block_size
            padding = (0, 0, diff, 0)
            hidden = torch.nn.functional.pad(hidden, padding, "constant", 0)

        return hidden

    def local_forward(self,
                      inputs: torch.LongTensor,
                      training: bool = True) -> SimpleNamespace:
        """ Forward pass through GPT2 synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, block_size)`, `required`): 
                    Batch_size length x list of text sentences.

                training (:obj:`bool`, `optional`, defaults to True):
                    Switch to True if this forward pass computes a CLM loss.

            Returns:
                SimpleNamespace {
                    local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer context.

                    local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer encoding produced using local_context.

                    local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                        GPT CLM target predictions produced using local_context.

                    local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        GPT CLM loss using local_context.
                }
        """
        _, t = inputs.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # FWD locally
        # Each index maps to a learnable vector
        token_embeddings = self.tok_emb(inputs)
        # Each Position maps to a learnable vector
        position_embeddings = self.pos_emb[:, :t, :]

        output = SimpleNamespace()
        # Dropout on token embeddings and position embeddings
        out = self.drop(token_embeddings + position_embeddings)

        # Pass through the transformer blocks and the final layer norm.
        out = self.blocks(out)
        out = self.ln_f(out)
        output.local_context = self.head(out)

        output.local_hidden = self.hidden_layer(output.local_context)

        if training:
            output.local_target = self.target_layer(output.local_hidden)

            shift_logits = output.local_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()

            output.local_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron,
                       inputs: torch.LongTensor,
                       training: bool) -> SimpleNamespace:
        """ Forward pass inputs and labels through the GPT2 module and into the remote network.


        Args:
            neuron (:obj: `bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            self.local_forward() + SimpleNamespace ( 

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote_context.

                    remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                        GPT CLM target predictions using the remote_context.

                    remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        GPT CLM loss using the remote_context.

                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between local_context and remote_context.

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite.
            )
        """
        inputs = torch.clamp(
            inputs, 0, bittensor.__vocab_size__)  # Filter out of range tokens.
        # Run local model
        # output = SimpleNamespace
        output = self.local_forward(inputs, training)

        # pooled: pooled hidden layer from local run, used as query context
        pooled = self.pooler(output.local_hidden.detach())

        # remote_context: joined responses from a dendrite.forward_text call.
        # remote_context.shape = [batch_size, sequence_len (or block_size), bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron,
                                                 inputs.to(self.device),
                                                 pooled)
        remote_context = output.router.response.to(self.device)

        # distillation_loss : distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context,
                                              remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(remote_context)

        if training:
            # remote_target : projection of remote_hidden onto the target dimension
            # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.remote_target = self.target_layer(output.remote_hidden)

            # remote_target_loss : CLM loss between remote_target and passed_targets.
            # remote_target_loss.shape = [1]
            shift_logits = output.remote_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()

            output.remote_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output
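
The local_target_loss computed above is an ordinary next-token (causal LM) cross-entropy: logits at position t are scored against the token at position t+1. A self-contained PyTorch sketch of just that shift-by-one bookkeeping, with toy dimensions standing in for bittensor's constants:

import torch
import torch.nn as nn

batch_size, sequence_len, vocab_size = 2, 8, 100                    # toy stand-ins
logits = torch.randn(batch_size, sequence_len, vocab_size)          # e.g. output.local_target
inputs = torch.randint(0, vocab_size, (batch_size, sequence_len))   # token ids

loss_fct = nn.CrossEntropyLoss()

# Drop the last position's logits and the first position's label so that
# position t predicts token t+1.
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = inputs[..., 1:].contiguous()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
print(loss.item())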
Example #12
 def check_config(config: Munch):
     assert config.synapse.target_dim > 0, "target dimension must be greater than 0."
     config = PKMRouter.check_config(config)
Example #13
class FFNNSynapse(bittensor.synapse.Synapse):
    """ Simple feed forward NN for images.
    """
    def __init__(self, config: Munch = None, **kwargs):
        r""" Init a new ffnn synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munch namespace config item.
        """
        super(FFNNSynapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = FFNNSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        FFNNSynapse.check_config(config)
        self.config = config

        # transform_layer: transforms images to common dimension.
        # [batch_size, -1, -1, -1] -> [batch_size, self.transform_dim]
        self.transform = Normalize((0.1307, ), (0.3081, ), device=self.device)
        self.transform_pool = nn.AdaptiveAvgPool2d((28, 28))
        self.transform_conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.transform_conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.transform_drop = nn.Dropout2d()
        self.transform_dim = 320

        # context_layer: distills the remote_context from the transform layer.
        # [batch_size, transform_dim] -> [batch_size, bittensor.__network_dim__]
        self.context_layer1 = nn.Linear(self.transform_dim, 256)
        self.context_layer2 = nn.Linear(256, bittensor.__network_dim__)

        # hidden_layer: learns hidden units for network and target.
        # [batch_size, transform_dim + bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.hidden_layer1 = nn.Linear(
            self.transform_dim + bittensor.__network_dim__,
            bittensor.__network_dim__)
        self.hidden_layer2 = nn.Linear(bittensor.__network_dim__,
                                       bittensor.__network_dim__)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # target_layer: Maps from hidden layer to target dimension
        # [batch_size, bittensor.__network_dim__] -> [batch_size, self.target_dim]
        self.target_layer1 = nn.Linear(bittensor.__network_dim__, 256)
        self.target_layer2 = nn.Linear(256, self.config.synapse.target_dim)

        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        FFNNSynapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument(
            '--synapse.target_dim',
            default=10,
            type=int,
            help='Final logit layer dimension. i.e. 10 for MNIST.')
        parser = PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.synapse.target_dim > 0, "target dimension must be greater than 0."
        config = PKMRouter.check_config(config)

    def forward_image(self, images: torch.Tensor):
        r""" Forward image inputs through the FFNN synapse .

                Args:
                    images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, channels, rows, cols)`, `required`):
                        Image tensors produced by calling PIL.toTensor() and with sequence dimension.
                
                Returns:
                    hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, bittensor.__network_dim__)`, `required`): 
                        Hidden layer encoding produced by using local_context.
        """
        # images: remove sequence dimension from images.
        # images.shape = [batch_size, channels, rows, cols]
        images = images.view(images.shape[0] * images.shape[1],
                             images.shape[2], images.shape[3],
                             images.shape[4]).to(self.device)

        # hidden: hidden layer using local_context for local computation only.
        # hidden.shape = [batch_size, __network_dim__]
        hidden = self.local_forward(images=images).local_hidden

        # hidden: re-add sequence dimension to outputs.
        # hidden.shape = [batch_size, sequence_dim, __network_dim__]
        hidden = torch.unsqueeze(hidden, 1)

        return hidden

    def local_forward(self,
                      images: torch.Tensor,
                      targets: torch.Tensor = None) -> SimpleNamespace:
        r""" Forward pass non-sequential image inputs and targets through the FFNN Synapse. The call does not make 
        remote queries to the network and returns only local hidden, target and losses.

        Args:
            images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): 
                PIL.toTensor() encoded images.

            targets (:obj:`torch.FloatTensor`  of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None): 
                Image labels.

        Returns:
            local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                Pre-Hidden layer context, trained to match the remote context.

            local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                Hidden layer produced from the context.

            local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                FFNN Target predictions using local_context. 

            local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                FFNN Classification loss using local_context.

            local_accuracy (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                Accuracy of target predictions.
            
        """

        # Return vars to be filled.
        output = SimpleNamespace()

        # transform: transform images to common shape.
        # transform.shape = [batch_size, self.transform_dim]
        transform = self.transform(images).to(self.device)
        transform = F.relu(F.max_pool2d(self.transform_conv1(transform), 2))
        transform = F.relu(
            F.max_pool2d(self.transform_drop(self.transform_conv2(transform)),
                         2))
        output.transform = transform.view(-1, self.transform_dim)

        # local_context: distillation model for remote_context.
        # local_context.shape = [batch_size, bittensor.__network_dim__]
        local_context = self.context_layer1(output.transform.detach())
        output.local_context = self.context_layer2(local_context)

        # local_hidden: hidden layer encoding using local_context.
        # local_hidden.shape = [batch_size, bittensor.__network_dim__]
        local_hidden = torch.cat(
            (output.transform, output.local_context.detach()), dim=1)
        local_hidden = F.relu(self.hidden_layer1(local_hidden))
        output.local_hidden = F.relu(self.hidden_layer2(local_hidden))

        if targets is not None:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, target_dim]
            targets = targets.to(self.device)
            local_target = self.target_layer1(output.local_hidden)
            local_target = self.target_layer2(local_target)
            output.local_target = F.log_softmax(local_target, dim=1)

            # local_target_loss: loss between local_target and passed targets.
            # local_target_loss.shape = [1]
            output.local_target_loss = F.nll_loss(output.local_target, targets)

            # Record extra metadata accuracy.
            max_logit = local_target.data.max(1, keepdim=True)[1]
            correct = max_logit.eq(targets.data.view_as(max_logit)).sum()
            output.local_accuracy = (100.0 * correct) / targets.shape[0]

        return output

    def remote_forward(self,
                       neuron: bittensor.neuron.Neuron,
                       images: torch.Tensor,
                       targets: torch.Tensor = None) -> SimpleNamespace:
        """
            Forward pass non-sequential image inputs and targets through the remote context of the synapse. The call
            makes RPC queries across the network using the passed neuron's metagraph and dendrite.
            
            Args:
                neuron (:obj: `bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

                images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): 
                    PIL.toTensor() encoded images.
                                
                targets (:obj:`torch.FloatTensor`  of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None): 
                    Image labels.
            
            Returns:
                self.local_forward() + SimpleNamespace ( 

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite remote call.

                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between the local and remote context.

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote context.

                    remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                        FFNN Target predictions using the remote_context.

                    remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        FFNN Classification loss using the remote_context.
            
        """
        # Call the local forward pass.
        # output = SimpleNamespace
        output = self.local_forward(images, targets)

        # Make remote queries using the PKMRouter.
        # remote_context: responses from a bittensor remote network call.
        # remote_context.shape = [batch_size, bittensor.__network_dim__]
        images = torch.unsqueeze(images, 1)
        output.router = self.router.forward_image(neuron, images,
                                                  output.local_hidden)
        remote_context = torch.squeeze(output.router.response,
                                       1).to(self.device)

        # Distill the local context to match the remote context.
        # distillation_loss: distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context,
                                              remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, bittensor.__network_dim__]
        remote_hidden = torch.cat([output.transform, remote_context], dim=1)
        remote_hidden = self.hidden_layer1(remote_hidden)
        output.remote_hidden = self.hidden_layer2(remote_hidden)

        if targets is not None:
            # Project hidden units onto the targets.
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, target_dim]
            remote_target = self.target_layer1(remote_hidden)
            remote_target = self.target_layer2(remote_target)
            output.remote_target = F.log_softmax(remote_target, dim=1)

            # Compute the target loss.
            # remote_target_loss: loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            output.remote_target_loss = F.nll_loss(output.remote_target,
                                                   targets)

            # Add extra metrics
            # Record extra metadata accuracy.
            max_logit = output.remote_target.data.max(1, keepdim=True)[1]
            correct = max_logit.eq(targets.data.view_as(max_logit)).sum()
            output.remote_accuracy = (100.0 * correct) / targets.shape[0]

        return output
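
The distillation_loss in remote_forward is what lets the purely local forward_image path stand in for network responses later: the context layers are regressed onto the detached remote context with an MSE objective, so gradients flow only into the local distillation model. A toy sketch of just that term, with made-up dimensions:

import torch
import torch.nn.functional as F

batch_size, network_dim = 4, 512                        # made-up dimensions
remote_context = torch.randn(batch_size, network_dim)   # stand-in for router.response
local_context = torch.randn(batch_size, network_dim, requires_grad=True)

# Detaching the remote side means only the local distillation parameters
# receive gradient from this loss.
distillation_loss = F.mse_loss(local_context, remote_context.detach())
distillation_loss.backward()
print(distillation_loss.item(), local_context.grad.shape)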
Example #14
class GPT2LMSynapse(bittensor.synapse.Synapse):
    """ A Bittensor Synapse training GPT2 with Causal Language Modelling (CLM)
    """
    def __init__(self, config: Munch = None):
        r""" Init a new GPT2 synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(GPT2LMSynapse, self).__init__(config=config)
        if config is None:
            config = GPT2LMSynapse.build_config()

        # Build hugging face config.
        huggingface_config = GPT2Config(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_layer=config.synapse.n_layer,
            n_head=config.synapse.n_head,
            n_inner=config.synapse.n_inner,
            activation_function=config.synapse.activation_function,
            resid_pdrop=config.synapse.resid_pdrop,
            embd_pdrop=config.synapse.embd_pdrop,
            attn_pdrop=config.synapse.attn_pdrop,
            layer_norm_epsilon=config.synapse.layer_norm_epsilon,
            initializer_range=config.synapse.initializer_range,
            summary_type=config.synapse.summary_type,
            summary_use_proj=config.synapse.summary_use_proj,
            summary_activation=config.synapse.summary_activation,
            summary_proj_to_labels=config.synapse.summary_proj_to_labels,
            summary_first_dropout=config.synapse.summary_first_dropout,
        )

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = GPT2Model(huggingface_config)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.pooler = GPT2Pooler(huggingface_config)

        # router: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function: causal LM (next-token) cross-entropy loss.
        # predicted: [batch_size, sequence_len, bittensor.__vocab_size__], targets: [batch_size, sequence_len] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)

    @staticmethod
    def build_config() -> Munch:
        parser = argparse.ArgumentParser()
        GPT2LMSynapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        GPT2LMSynapse.check_config(config)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        r""" Add custom params to the parser.
        """
        parser.add_argument(
            '--synapse.n_head',
            default=1,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )
        parser.add_argument(
            '--synapse.n_layer',
            default=2,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument(
            '--synapse.n_inner',
            default=8,
            type=int,
            help=
            'The dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd'
        )
        parser.add_argument(
            '--synapse.activation_function',
            default='gelu_new',
            type=str,
            help=
            'Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`'
        )
        parser.add_argument('--synapse.resid_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT residual dropout probability.')
        parser.add_argument('--synapse.embd_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT embedding dropout probability.')
        parser.add_argument('--synapse.attn_pdrop',
                            default=0.1,
                            type=float,
                            help='GPT attention dropout probability.')
        parser.add_argument(
            '--synapse.layer_norm_epsilon',
            default=1e-05,
            type=float,
            help='The epsilon to use in the layer normalization layers.')
        parser.add_argument(
            '--synapse.summary_type',
            default='cls_index',
            type=str,
            help=
            'Supply a Tensor of classification token position (like GPT/GPT-2).'
        )
        parser.add_argument(
            '--synapse.initializer_range',
            default=0.02,
            type=float,
            help=
            'The standard deviation of the truncated_normal_initializer for initializing all weight matrices.'
        )
        parser.add_argument(
            '--synapse.summary_use_proj',
            default=True,
            type=bool,
            help=
            'Whether or not to add a projection after the vector extraction.')
        parser.add_argument(
            '--synapse.summary_activation',
            type=str,
            help=
            'Pass "tanh" for a tanh activation to the output, any other value will result in no activation.'
        )
        parser.add_argument(
            '--synapse.summary_proj_to_labels',
            default=True,
            type=bool,
            help=
            'Whether the projection outputs should have config.num_labels or config.hidden_size classes.'
        )
        parser.add_argument(
            '--synapse.summary_first_dropout',
            default=0.1,
            type=float,
            help=
            'The dropout ratio to be used after the projection and activation.'
        )
        parser.add_argument(
            '--synapse.n_block_filter',
            default=100,
            type=int,
            help='Stale neurons are filtered after this many blocks.')
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        pass

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the MLM GPT Synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of tokenized sentences.
            
            Returns:
                hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): 
                    Hidden layer representation produced using the local_context.
        """
        hidden = self.local_forward(inputs=inputs.to(self.device),
                                    training=False).local_hidden
        return hidden

    def local_forward(self,
                      inputs: torch.LongTensor,
                      training: bool = True) -> SimpleNamespace:
        r""" Forward pass through GPT synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

                training (:obj:`bool`, `optional`, defaults to True):
                    Switch to True if this forward pass computes a CLM loss.

            Returns:
                SimpleNamespace {
                    local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer context.

                    local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer encoding produced using local_context.

                    local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                        GPT CLM target predictions produced using local_context.

                    local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        GPT CLM loss using local_context.
                }
        """
        inputs = torch.clamp(
            inputs, 0, bittensor.__vocab_size__)  # Filter out of range tokens.

        # Return vars to be filled.
        output = SimpleNamespace()

        # local_context: distilled version of remote_context.
        # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_context = self.transformer(
            input_ids=inputs, return_dict=True).last_hidden_state

        # local_hidden: hidden layer encoding of sequence with local_context.
        # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_hidden = self.hidden_layer(output.local_context)

        if training:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.local_target = self.target_layer(output.local_hidden)

            # local_target_loss: CLM loss between local_target and the (shifted) input tokens.
            # local_target_loss.shape = [1]
            shift_logits = output.local_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.local_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron,
                       inputs: torch.LongTensor,
                       training: bool) -> SimpleNamespace:
        """ Forward pass inputs and labels through the GPT2 module.


        Args:
            neuron (:obj: `bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            self.local_forward() + SimpleNamespace ( 

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote_context.

                    remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                        GPT CLM target predictions using the remote_context.

                    remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        GPT CLM loss using the remote_context.

                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between local_context and remote_context.

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite.
            )
        """
        inputs = torch.clamp(
            inputs, 0, bittensor.__vocab_size__)  # Filter out of range tokens.

        # Run the local model.
        # output = SimpleNamespace
        output = self.local_forward(inputs, training)

        # pooled: pooled hidden layer from local run, used as our query context.
        # pooled.shape = [batch_size, bittensor.__network_dim__]
        pooled = self.pooler(output.local_hidden.detach())

        # remote_context: joined responses from a dendrite.forward_text call.
        # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron,
                                                 inputs.to(self.device),
                                                 pooled)
        remote_context = output.router.response

        # distillation_loss: distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context,
                                              remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(remote_context)

        if training:
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.remote_target = self.target_layer(output.remote_hidden)

            # remote_target_loss: CLM loss between remote_target and the (shifted) input tokens.
            # remote_target_loss.shape = [1]
            shift_logits = output.remote_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.remote_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output
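
build_config above relies on bittensor.config.Config.to_config to turn the dotted argparse flags into a nested Munch namespace (config.synapse.n_head and so on). That helper is not shown in these examples; a rough, purely illustrative stand-in that produces the same kind of nested namespace could look like this:

import argparse
from munch import Munch

def to_munch(namespace: argparse.Namespace) -> Munch:
    # Hypothetical stand-in for bittensor.config.Config.to_config: fold dotted
    # keys such as 'synapse.n_head' into nested Munch attributes.
    config = Munch()
    for key, value in vars(namespace).items():
        node = config
        *parents, leaf = key.split('.')
        for part in parents:
            node = node.setdefault(part, Munch())
        node[leaf] = value
    return config

parser = argparse.ArgumentParser()
parser.add_argument('--synapse.n_head', default=1, type=int)
parser.add_argument('--synapse.n_layer', default=2, type=int)

config = to_munch(parser.parse_args([]))
print(config.synapse.n_head, config.synapse.n_layer)  # 1 2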
Example #15
class BertSynapseBase(bittensor.synapse.Synapse):
    def __init__(self, config: Munch = None, **kwargs):
        r""" Init a new base-bert synapse.

            Args:
                config (:obj:`munch.Munch`, `required`): 
        """
        super(BertSynapseBase, self).__init__(config=config, **kwargs)
        if config is None:
            config = BertSynapseBase.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        BertSynapseBase.check_config(config)
        self.config = config

        # Hugging face config item.
        huggingface_config = BertConfig(
            vocab_size=bittensor.__vocab_size__,
            hidden_size=bittensor.__network_dim__,
            num_hidden_layers=config.synapse.num_hidden_layers,
            num_attention_heads=config.synapse.num_attention_heads,
            intermediate_size=bittensor.__network_dim__,
            is_decoder=False)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = BertModel(huggingface_config,
                                     add_pooling_layer=True)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = torch.nn.Linear(bittensor.__network_dim__,
                                            bittensor.__network_dim__)

        # pooling_layer: transforms the hidden layer into a pooled representation by taking the encoding of the first token.
        # [batch_size, sequence_dim,  bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.pooler = BertPooler(huggingface_config)

        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        BertSynapseBase.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        r""" Add custom params to the parser.
        """
        parser.add_argument(
            '--synapse.num_hidden_layers',
            default=2,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument(
            '--synapse.num_attention_heads',
            default=2,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )
        parser.add_argument(
            '--synapse.n_block_filter',
            default=100,
            type=int,
            help='Stale neurons are filtered after this many blocks.')
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        r""" Add custom checks to the config.
        """
        pass

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the BERT NSP Synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of tokenized sentences.
            
            Returns:
                hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): 
                    Hidden layer representation produced using the local_context.
        """
        hidden = self.base_local_forward(inputs=inputs).local_hidden
        return hidden

    def base_local_forward(self,
                           inputs: torch.LongTensor,
                           attention_mask: torch.LongTensor = None):
        r""" Forward pass inputs and labels through the NSP BERT module.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

                attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `optional`): 
                    Mask to avoid performing attention on padding token indices.
                    Mask values selected in ``[0, 1]``:
                        - 1 for tokens that are **not masked**,
                        - 0 for tokens that are **masked**.

            Returns:
                SimpleNamespace {
                    local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer context.

                    local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer encoding produced using local_context.

                    local_pooled (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                        Local hidden state pooled by returning the encoding of the first token.
                }
        """
        inputs = torch.clamp(
            inputs, 0, bittensor.__vocab_size__)  # Filter out of range tokens.
        # Return vars to be filled.
        output = SimpleNamespace()

        # local_context: distilled version of remote_context.
        # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_context = self.transformer(
            input_ids=inputs, return_dict=True,
            attention_mask=attention_mask).last_hidden_state

        # local_hidden: hidden layer encoding of sequence using local context
        # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_hidden = self.hidden_layer(output.local_context)
        output.local_pooled = self.pooler(output.local_hidden)

        return output

    def base_remote_forward(self,
                            neuron: bittensor.neuron.Neuron,
                            inputs: torch.LongTensor,
                            attention_mask: torch.LongTensor = None):
        """Forward pass inputs and labels through the remote BERT networks.

        Args:
            neuron (:obj:`bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.                

            attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `optional`): 
                    Mask to avoid performing attention on padding token indices.
                    Mask values selected in ``[0, 1]``:
                        - 1 for tokens that are **not masked**,
                        - 0 for tokens that are **masked**.

        Returns:
            SimpleNamespace ( 
                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between local_context and remote_context.

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote_context.

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite.
                )
        """
        inputs = torch.clamp(
            inputs, 0, bittensor.__vocab_size__)  # Filter out of range tokens.
        output = self.base_local_forward(inputs=inputs,
                                         attention_mask=attention_mask)

        # remote_context: joined responses from a bittensor.forward_text call.
        # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron=neuron,
                                                 text=inputs,
                                                 query=output.local_pooled)

        # distillation_loss: distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context,
                                              output.router.response.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(output.router.response)
        output.remote_pooled = self.pooler(output.remote_hidden)

        return output
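
The distillation step in base_remote_forward only needs an MSE between the local context and a detached copy of the remote response: gradients then train the local (student) context to imitate the network without flowing back through the remote call. A toy sketch with random tensors standing in for the transformer outputs:

import torch
import torch.nn.functional as F

batch_size, sequence_len, network_dim = 2, 16, 512   # illustrative sizes only
local_context = torch.randn(batch_size, sequence_len, network_dim, requires_grad=True)
remote_response = torch.randn(batch_size, sequence_len, network_dim)   # pretend dendrite response

# detach() stops gradients on the remote side; only local_context receives them.
distillation_loss = F.mse_loss(local_context, remote_response.detach())
distillation_loss.backward()
print(local_context.grad.shape)   # torch.Size([2, 16, 512])
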
Beispiel #16
0
 def add_args(parser: argparse.ArgumentParser):
     r""" Add custom params to the parser.
     """
     parser.add_argument(
         '--synapse.n_head',
         default=1,
         type=int,
         help=
         'Number of attention heads for each attention layer in the Transformer encoder.'
     )
     parser.add_argument(
         '--synapse.n_layer',
         default=2,
         type=int,
         help='Number of hidden layers in the Transformer encoder.')
     parser.add_argument(
         '--synapse.n_inner',
         default=8,
         type=int,
         help=
         'The dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd'
     )
     parser.add_argument(
         '--synapse.activation_function',
         default='gelu_new',
         type=str,
         help=
         'Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.'
     )
     parser.add_argument('--synapse.resid_pdrop',
                         default=0.1,
                         type=float,
                         help='GPT residual dropout probability.')
     parser.add_argument('--synapse.embd_pdrop',
                         default=0.1,
                         type=float,
                         help='GPT embedding dropout probability.')
     parser.add_argument('--synapse.attn_pdrop',
                         default=0.1,
                         type=float,
                         help='GPT attention dropout probability.')
     parser.add_argument(
         '--synapse.layer_norm_epsilon',
         default=1e-05,
         type=float,
         help='The epsilon to use in the layer normalization layers.')
     parser.add_argument(
         '--synapse.summary_type',
         default='cls_index',
         type=str,
         help=
         'Supply a Tensor of classification token position (like GPT/GPT-2).'
     )
     parser.add_argument(
         '--synapse.initializer_range',
         default=0.02,
         type=float,
         help=
         'The standard deviation of the truncated_normal_initializer for initializing all weight matrices.'
     )
     parser.add_argument(
         '--synapse.summary_use_proj',
         default=True,
         type=bool,
         help=
         'Whether or not to add a projection after the vector extraction.')
     parser.add_argument(
         '--synapse.summary_activation',
         type=str,
         help=
         'Pass "tanh" for a tanh activation to the output, any other value will result in no activation.'
     )
     parser.add_argument(
         '--synapse.summary_proj_to_labels',
         default=True,
         type=bool,
         help=
         'Whether the projection outputs should have config.num_labels or config.hidden_size classes.'
     )
     parser.add_argument(
         '--synapse.summary_first_dropout',
         default=0.1,
         type=float,
         help=
         'The dropout ratio to be used after the projection and activation.'
     )
     parser.add_argument(
         '--synapse.n_block_filter',
         default=100,
         type=int,
         help='Stale neurons are filtered after this many blocks.')
     PKMRouter.add_args(parser)
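
The dotted flag names above ('--synapse.n_head', etc.) land in the argparse namespace under keys like 'synapse.n_head'; bittensor's Config.to_config then regroups them into a nested munch. That helper is not shown in these examples, so the regrouping below is only a rough, hypothetical illustration of the idea:

import argparse
from munch import Munch

parser = argparse.ArgumentParser()
parser.add_argument('--synapse.n_head', default=1, type=int)
parser.add_argument('--synapse.n_layer', default=2, type=int)
namespace = parser.parse_args(['--synapse.n_head', '4'])

# Hypothetical regrouping: 'synapse.n_head' -> config.synapse.n_head.
config = Munch()
for key, value in vars(namespace).items():
    section, name = key.split('.', 1)
    config.setdefault(section, Munch())[name] = value

print(config.synapse.n_head, config.synapse.n_layer)   # 4 2
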
Beispiel #17
0
    def add_args(parser: argparse.ArgumentParser):
        """ Add custom params to the Synapse

        Args:
            parser (:obj:`argparse.ArgumentParser`): Argument Parser object.

        """
        parser.add_argument(
            '--synapse.emb_dim',
            default=bittensor.__network_dim__,
            type=int,
            help='Dimensionality of the encoder layers and the pooler layer.')
        parser.add_argument(
            '--synapse.n_layers',
            default=12,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument(
            '--synapse.n_heads',
            default=16,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )
        parser.add_argument(
            '--synapse.dropout',
            default=0.1,
            type=float,
            help=
            'The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.'
        )
        parser.add_argument(
            '--synapse.attention_dropout',
            default=0.1,
            type=float,
            help='The dropout probability for the attention mechanism.')
        parser.add_argument(
            '--synapse.gelu_activation',
            default=True,
            type=bool,
            help=
            'Whether or not to use gelu for the activations instead of relu.')
        parser.add_argument(
            '--synapse.sinusoidal_embeddings',
            default=False,
            type=bool,
            help=
            'Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.'
        )
        parser.add_argument(
            '--synapse.causal',
            default=False,
            type=bool,
            help=
            'Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead of a bidirectional context.'
        )
        parser.add_argument(
            '--synapse.asm',
            default=False,
            type=bool,
            help=
            'Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer.'
        )
        parser.add_argument(
            '--synapse.n_langs',
            default=1,
            type=int,
            help=
            'The number of languages the model handles. Set to 1 for monolingual models.'
        )
        parser.add_argument(
            '--synapse.use_lang_emb',
            default=True,
            type=bool,
            help=
            'Whether to use language embeddings. Some models use additional language embeddings, see the multilingual models page for information on how to use them.'
        )
        parser.add_argument(
            '--synapse.max_position_embeddings',
            default=512,
            type=int,
            help=
            'The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).'
        )
        parser.add_argument(
            '--synapse.embed_init_std',
            default=pow(2048, -0.5),
            type=float,
            help=
            'The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.'
        )
        parser.add_argument(
            '--synapse.init_std',
            default=0.02,
            type=float,
            help=
            'The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices.'
        )
        parser.add_argument(
            '--synapse.layer_norm_eps',
            default=1e-12,
            type=float,
            help='The epsilon used by the layer normalization layers.')
        parser.add_argument(
            '--synapse.bos_index',
            default=0,
            type=int,
            help=
            'The index of the beginning of sentence token in the vocabulary.')
        parser.add_argument(
            '--synapse.eos_index',
            default=1,
            type=int,
            help='The index of the end of sentence token in the vocabulary.')
        parser.add_argument(
            '--synapse.pad_index',
            default=2,
            type=int,
            help='The index of the padding token in the vocabulary.')
        parser.add_argument(
            '--synapse.unk_index',
            default=3,
            type=int,
            help='The index of the unknown token in the vocabulary.')
        parser.add_argument(
            '--synapse.mask_index',
            default=5,
            type=int,
            help='The index of the masking token in the vocabulary.')
        parser.add_argument(
            '--synapse.is_encoder',
            default=True,
            type=bool,
            help=
            'Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.'
        )
        parser.add_argument(
            '--synapse.summary_type',
            default="first",
            type=str,
            help=
            'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.'
        )
        parser.add_argument(
            '--synapse.summary_use_proj',
            default=True,
            type=bool,
            help=
            'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. Whether or not to add a projection after the vector extraction.'
        )
        parser.add_argument(
            '--synapse.summary_activation',
            type=str,
            help=
            'Pass "tanh" for a tanh activation to the output, any other value will result in no activation.'
        )
        parser.add_argument(
            '--synapse.summary_proj_to_labels',
            default=True,
            type=bool,
            help=
            'Whether the projection outputs should have config.num_labels or config.hidden_size classes.'
        )
        parser.add_argument(
            '--synapse.summary_first_dropout',
            default=0.1,
            type=float,
            help=
            'The dropout ratio to be used after the projection and activation.'
        )
        parser.add_argument('--synapse.start_n_top',
                            default=5,
                            type=int,
                            help='Used in the SQuAD evaluation script.')
        parser.add_argument('--synapse.end_n_top',
                            default=5,
                            type=int,
                            help='Used in the SQuAD evaluation script.')
        parser.add_argument(
            '--synapse.mask_token_id',
            default=0,
            type=int,
            help=
            'Model agnostic parameter to identify masked tokens when generating text in an MLM context.'
        )
        parser.add_argument(
            '--synapse.lang_id',
            default=1,
            type=int,
            help=
            'The ID of the language used by the model. This parameter is used when generating text in a given language.'
        )
        PKMRouter.add_args(parser)
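
Many of these flags use type=bool, which is a well-known argparse pitfall: bool() of any non-empty string, including 'False', is True, so passing '--synapse.gelu_activation False' would not actually disable anything. A small demonstration, with a hypothetical str2bool converter as one common workaround:

import argparse

def str2bool(value: str) -> bool:
    # Hypothetical helper; argparse has no built-in boolean string parser.
    return value.lower() in ('1', 'true', 'yes')

parser = argparse.ArgumentParser()
parser.add_argument('--plain_bool', default=True, type=bool)
parser.add_argument('--safe_bool', default=True, type=str2bool)
namespace = parser.parse_args(['--plain_bool', 'False', '--safe_bool', 'False'])

print(namespace.plain_bool)   # True  -- bool('False') is truthy
print(namespace.safe_bool)    # False
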
Beispiel #18
0
    def __init__(self, config, **kwargs):
        """The full GPT language model, with context of a block size.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munched config class.
        """
        super(GPT2Synapse, self).__init__(config=config, **kwargs)

        if config == None:
            config = GPT2Synapse.default_config()

        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        GPT2Synapse.check_config(config)
        self.config = config

        gpt_config = GPTConfig(vocab_size=bittensor.__vocab_size__,
                               n_embd=bittensor.__network_dim__,
                               n_head=config.synapse.n_head,
                               n_layer=config.synapse.n_layer,
                               block_size=config.synapse.block_size,
                               embd_pdrop=config.synapse.embd_pdrop,
                               resid_pdrop=config.synapse.resid_pdrop,
                               attn_pdrop=config.synapse.attn_pdrop)
        # Token embedding layer.
        # [bittensor.__vocab_size__, bittensor.__network_dim__]
        self.tok_emb = nn.Embedding(gpt_config.vocab_size, gpt_config.n_embd)

        # Positional embedding.
        # [1, block_size, bittensor.__network_dim__]
        self.pos_emb = nn.Parameter(
            torch.zeros(1, gpt_config.block_size, gpt_config.n_embd))
        self.drop = nn.Dropout(gpt_config.embd_pdrop)

        # Transformer blocks
        self.blocks = nn.Sequential(
            *[Block(gpt_config) for _ in range(gpt_config.n_layer)])

        # Decoder head
        self.ln_f = nn.LayerNorm(gpt_config.n_embd)

        # Head
        # [ bittensor.__network_dim__, bittensor.__network_dim__ ]
        self.head = nn.Linear(gpt_config.n_embd,
                              bittensor.__network_dim__,
                              bias=False)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = GPTPooler(gpt_config)

        # Router: (PKM layer) queries network using pooled embeddings as context.
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # Hidden layer
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # Target layer
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      gpt_config.vocab_size,
                                      bias=False)

        # Block size here corresponds to sequence lengths
        self.block_size = gpt_config.block_size
        self.apply(self._init_weights)

        # Loss function: MLM cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.num_parameters = sum(p.numel() for p in self.parameters())
        self.to(self.device)
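
The embedding stem built in this example follows the minGPT pattern: a token embedding plus a learned positional-embedding parameter, summed and passed through dropout. A minimal sketch of that stem with placeholder sizes:

import torch
import torch.nn as nn

vocab_size, n_embd, block_size = 1000, 64, 32   # illustrative sizes only
tok_emb = nn.Embedding(vocab_size, n_embd)
pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
drop = nn.Dropout(0.1)

idx = torch.randint(0, vocab_size, (2, 10))         # [batch_size, seq_len], seq_len <= block_size
token_embeddings = tok_emb(idx)                     # [2, 10, 64]
position_embeddings = pos_emb[:, :idx.size(1), :]   # slice to the actual sequence length
x = drop(token_embeddings + position_embeddings)
print(x.shape)   # torch.Size([2, 10, 64])
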
Beispiel #19
0
class DPNSynapse(bittensor.synapse.Synapse):
    """ Bittensor endpoint trained on PIL images to detect objects using an DPN.
    """

    def __init__( self, config: Munch = None):
        r""" Init a new DPN synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`):
                    munch namespace config item.
        """
        super(DPNSynapse, self).__init__(config = config)
        if config == None:
            config = DPNSynapse.build_config()
        
        in_planes, out_planes = config.synapse.in_planes, config.synapse.out_planes
        num_blocks, dense_depth = config.synapse.num_blocks, config.synapse.dense_depth

        # Transform Network
        """ Transform network.
                Layers take in image inputs, normalize them, and apply
                4 convolutional layers.
            Image encoder: transforms PIL-encoded tensors to a common shape.
            [batch_size, channels, rows, cols] -> [batch_size, -1, -1, -1] 

            Output: [batch_size, self.transform_dim (9728)]
        """
        self.transform = Normalize((0.1307,), (0.3081,), device=self.device)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((32, 32))
        self.transform_conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.transform_bn1 = nn.BatchNorm2d(64)
        self.last_planes = 64
        self.transform_layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.transform_layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.transform_layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=1)
        self.transform_layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        self.transform_dim = (out_planes[3] * 4)+(((num_blocks[3]+1) * 4)*dense_depth[3])
        
        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim = self.transform_dim)

        # Context layers.
        """
            Distillation model for remote context. This layer takes input 
            coming from transform layer, and runs it through 3 linear layers,
            projecting it to bittensor.__network_dim__.  
        """
        self.context_layer1 = nn.Linear(self.transform_dim, 512)
        self.context_layer2 = nn.Linear(512, 256)
        self.context_layer3 = nn.Linear(256, bittensor.__network_dim__)

        # hidden layer.
        self.hidden_layer1 = nn.Linear(self.transform_dim + bittensor.__network_dim__, 512)
        self.hidden_layer2 = nn.Linear(512, 256)
        self.hidden_layer3 = nn.Linear(256, bittensor.__network_dim__)

        # Layers to project target down to target size passed by config
        # (number of classes)
        self.target_layer1 = nn.Linear(bittensor.__network_dim__, 128)
        self.target_layer2 = nn.Linear(128, self.config.synapse.target_dim)

        self.to(self.device)

    @staticmethod   
    def build_config() -> Munch:
        parser = argparse.ArgumentParser(); 
        DPNSynapse.add_args(parser) 
        config = bittensor.config.Config.to_config(parser); 
        DPNSynapse.check_config(config)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        r""" This function adds the configuration items for the DPN synapse.
        These args are used to instantiate a Dual Path model.
        Instantiating a configuration with the defaults will yield a "shallow" DPN-26 configuration. 

        For deeper network configurations, it is possible to set the num_blocks parameter to (3, 4, 20, 3) for a
        DPN-92. 
        
        For DPN-98 set the following:
            in_planes: (160, 320, 640, 1280)
            out_planes: (256, 512, 1024, 2048)
            num_blocks: (3, 6, 20, 3)
            dense_depth: (16, 32, 32, 128)
        """
        def to_list(arg):
            return [int(i) for i in arg.split(",")]
        parser.add_argument('--synapse.in_planes', default='160, 320, 640, 1280', action="append", type=to_list)
        parser.add_argument('--synapse.out_planes', default='256, 512, 1024, 2048', action="append", type=to_list)
        parser.add_argument('--synapse.num_blocks', default='3, 6, 20, 3', action="append", type=to_list)
        parser.add_argument('--synapse.dense_depth', default='16, 32, 32, 128', action="append", type=to_list)
        parser.add_argument('--synapse.target_dim', default=10, type=int, help='Final logit layer dimension. i.e. 10 for CIFAR-10.')
        parser = PKMRouter.add_args(parser)
    
    @staticmethod
    def check_config(config: Munch):
        assert isinstance(config.synapse.in_planes, list), 'synapse.in_planes must be a tuple, got {}'.format(config.synapse.in_planes)
        assert isinstance(config.synapse.out_planes, list), 'synapse.out_planes must be a tuple, got {}'.format(config.synapse.out_planes)
        assert isinstance(config.synapse.num_blocks, list), 'synapse.num_blocks must be a tuple, got {}'.format(config.synapse.num_blocks)
        assert isinstance(config.synapse.dense_depth, list), 'synapse.dense_depth must be a tuple, got {}'.format(config.synapse.dense_depth)
        assert all(isinstance(el, int) for el in config.synapse.in_planes), 'synapse.in_planes must be a tuple of ints, got {}'.format(config.synapse.in_planes)
        assert all(isinstance(el, int) for el in config.synapse.out_planes), 'synapse.out_planes must be a tuple of ints, got {}'.format(config.synapse.out_planes)
        assert all(isinstance(el, int) for el in config.synapse.num_blocks), 'synapse.num_blocks must be a tuple of ints, got {}'.format(config.synapse.num_blocks)
        assert all(isinstance(el, int) for el in config.synapse.dense_depth), 'synapse.dense_depth must be a tuple of ints, got {}'.format(config.synapse.dense_depth)
    
    def forward_image ( self, images: torch.Tensor):
        r""" Forward image inputs through the DPN synapse .

            Args:
                inputs (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, channels, rows, cols)`, `required`): 
                    Image tensors produced by calling PIL.toTensor() and with sequence dimension.
            
            Returns:
                hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, bittensor.__network_dim__)`, `required`): 
                    Hidden layer encoding produced by using local_context.
        """
        # images: remove sequence dimension from images.
        # images.shape = [batch_size, channels, rows, cols] 
        images = images.view(images.shape[0] * images.shape[1], images.shape[2], images.shape[3], images.shape[4])

        # hidden: hidden layer using local context for local computation only.
        # hidden.shape = [batch_size, __network_dim__] 
        hidden = self.forward (images = images.to(self.device), remote = False).local_hidden
        
        # hidden: re-add sequence dimension to outputs.
        # hidden.shape = [batch_size, sequence_dim, __network_dim__] 
        hidden = torch.unsqueeze(hidden, 1)

        return hidden

    def local_forward ( self, images: torch.Tensor, targets: torch.Tensor = None ) -> SimpleNamespace:
        r""" Forward pass non-sequential image inputs and targets through the DPN Synapse.

            Args:
                images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): 
                    PIL.toTensor() encoded images.

                targets (:obj:`torch.FloatTensor`  of shape :obj:`(batch_size, config.target_size)`, `optional`): 
                    Image labels.

            Returns:
                SimpleNamespace ( 
                    local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                        Pre-Hidden layer context, trained to match the remote context.

                    local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                        Hidden layer produced from the context.

                    local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                        FFNN Target predictions using local_context. 

                    local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        FFNN Classification loss using local_context.

                    local_accuracy (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Accuracy of target predictions.

                    transform (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, transform_dim)`, `optional`): 
                        transformation of various sized images to batch-size transform dim.
                )
        """
        # Return vars to be filled.
        output = SimpleNamespace ()

        r"""
            Transform the images into a common shape (32x32)
        """
        # transform: transform images to common shape.
        # transform.shape = [batch_size, self.transform_dim]
        transform = self.transform(images)
        transform = self.adaptive_pool(transform)
        transform = F.relu(self.transform_bn1(self.transform_conv1(transform.detach())))
        transform = self.transform_layer1(transform)
        transform = self.transform_layer2(transform)
        transform = self.transform_layer3(transform)
        transform = self.transform_layer4(transform)
        transform = F.avg_pool2d(transform, 4)
        output.transform = torch.flatten(transform, start_dim=1)

        # local_context: distillation model for remote_context.
        # local_context.shape = [batch_size, bittensor.__network_dim__]
        local_context = self.context_layer1(output.transform.detach())
        local_context = self.context_layer2(local_context)
        output.local_context = self.context_layer3(local_context)
        
        # local_hidden: hidden layer encoding using local_context.
        # local_hidden.shape = [batch_size, bittensor.__network_dim__]
        local_hidden = torch.cat([output.transform, output.local_context], dim=1)
        local_hidden = self.hidden_layer1(local_hidden)
        local_hidden = self.hidden_layer2(local_hidden)
        output.local_hidden = self.hidden_layer3(local_hidden)
        
        if targets is not None:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, target_dim]
            targets = targets.to(self.device)
            local_target = self.target_layer1(output.local_hidden)
            local_target = self.target_layer2(local_target)
            output.local_target = F.log_softmax(local_target, dim=1)

            # local_target_loss: loss between local_target and passed targets.
            # local_target_loss.shape = [1]
            output.local_target_loss  = F.nll_loss(output.local_target, targets)

            # Record extra metadata accuracy.
            max_logit = local_target.data.max(1, keepdim=True)[1]
            correct = max_logit.eq( targets.data.view_as(max_logit) ).sum()
            output.local_accuracy = (100.0 * correct) / targets.shape[0] 
        
        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron, images: torch.Tensor, targets: torch.Tensor = None) -> SimpleNamespace:
        """
            Forward pass non-sequential image inputs and targets through the synapse. Makes RPC queries to downstream neurons.
            
            Args:
                neuron (:obj: `bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

                images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): 
                    PIL.toTensor() encoded images.
                                
                targets (:obj:`torch.FloatTensor`  of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None): 
                    Image labels.
            
            Returns:
                self.local_forward() + SimpleNamespace ( 

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite remote call.

                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between the local and remote context.

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote context.

                    remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                        FFNN Target predictions using the remote_context.

                    remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        FFNN Classification loss using the remote_context.
                )
        """
        # Call the local forward pass.
        # output = SimpleNamespace
        output = self.local_forward( images, targets ) 

        # Make remote queries using the PKMRouter.
        # remote_context: responses from a bittensor remote network call.
        # remote_context.shape = [batch_size, bittensor.__network_dim__]
        images = torch.unsqueeze(images, 1)
        output.router = self.router.forward_image( neuron, images, output.transform )
        remote_context = torch.squeeze( output.router.response, 1 ).to(self.device)

        # Distill the local context to match the remote context.
        # distillation_loss: distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach() )

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, bittensor.__network_dim__]
        remote_hidden = torch.cat([output.transform, remote_context], dim=1)
        remote_hidden = self.hidden_layer1(remote_hidden)
        remote_hidden = self.hidden_layer2(remote_hidden)
        output.remote_hidden = self.hidden_layer3(remote_hidden)

        if targets is not None:
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, config.target_size]
            remote_target = self.target_layer1(output.remote_hidden)
            remote_target = self.target_layer2(remote_target)
            output.remote_target = F.log_softmax(remote_target, dim=1)

            # remote_target_loss: loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            output.remote_target_loss = F.nll_loss(output.remote_target, targets)
        
        return output

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        """ Generates a sequential container containing Bottleneck layers.  

        Args:
            in_planes (tuple): 
                4-element tuple describing the in_planes config.

            out_planes (tuple): 
                4-element tuple describing the out_planes config.

            num_blocks (tuple): 
                4-element tuple describing the number of blocks at this layer.

            dense_depth (tuple): 
                4-element tuple describing the depth of this layer.
           
            stride (int): 
                Convolutional stride length.

        Returns:
            nn.Sequential: A torch.nn sequential container containing the layers outlined in the inputs.
        """
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for i,stride in enumerate(strides):
            layers.append(self.Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))
            self.last_planes = out_planes + (i+2) * dense_depth
        return nn.Sequential(*layers)
    
    class Bottleneck(nn.Module):
        def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
            super(DPNSynapse.Bottleneck, self).__init__()
            self.out_planes = out_planes
            self.dense_depth = dense_depth

            self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
            self.bn1 = nn.BatchNorm2d(in_planes)
            self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
            self.bn2 = nn.BatchNorm2d(in_planes)
            self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)
            self.bn3 = nn.BatchNorm2d(out_planes + dense_depth)

            self.shortcut = nn.Sequential()
            if first_layer:
                self.shortcut = nn.Sequential(
                    nn.Conv2d(last_planes, out_planes + dense_depth, kernel_size=1, stride=stride, bias=False),
                    nn.BatchNorm2d(out_planes + dense_depth)
                )

        def forward(self, x):
            out = F.relu(self.bn1(self.conv1(x)))
            out = F.relu(self.bn2(self.conv2(out)))
            out = self.bn3(self.conv3(out))
            x = self.shortcut(x)
            d = self.out_planes
            out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)
            out = F.relu(out)
            return out
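
The Bottleneck.forward above is the dual-path combine: the first out_planes channels from the shortcut and conv paths are merged residually (added), while the dense_depth channels from both paths are concatenated, so the channel count grows at every block. A shape-only sketch with small made-up sizes:

import torch

out_planes, dense_depth = 8, 4    # illustrative sizes only
x = torch.randn(2, out_planes + dense_depth, 16, 16)     # shortcut path
out = torch.randn(2, out_planes + dense_depth, 16, 16)   # convolutional path

d = out_planes
combined = torch.cat([
    x[:, :d] + out[:, :d],   # residual add over the first d channels
    x[:, d:],                # dense features accumulated so far
    out[:, d:],              # newly produced dense features
], dim=1)
print(combined.shape)   # torch.Size([2, 16, 16, 16]) -- channels grew by dense_depth
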
Beispiel #20
0
class XLMSynapse(bittensor.synapse.Synapse):
    """A Bittensor Synapse training XLM 

    Args:
        synapse (:obj:`Synapse`): The Synapse superclass, which contains fwd and backward logic.

    """
    def __init__(self, config: Munch = None, **kwargs):
        """ Initialize a new XLM synapse module.

        Args:
            config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(XLMSynapse, self).__init__(config=config, **kwargs)
        if config == None:
            config = XLMSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        XLMSynapse.check_config(config)
        self.config = config

        # Build config.
        xlm_config = XLMConfig(
            vocab_size=bittensor.__vocab_size__,
            emb_dim=bittensor.__network_dim__,
            n_layers=config.synapse.n_layers,
            n_heads=config.synapse.n_heads,
            # More needed
        )

        # model layer: encodes tokenized sequences to network dim.
        self.xlm = XLMModel(xlm_config)

        # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = XLMPooler(xlm_config)

        # router: (PKM layer) queries network using embeddings as context
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden layer: transforms context and encoding to network dimension hidden units.
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target layer: maps from hidden layer to vocab dimension for each token.
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        XLMSynapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        """ Add custom params to the Synapse

        Args:
            parser (:obj:`argparse.ArgumentParser`): Argument Parser object.

        """
        parser.add_argument(
            '--synapse.emb_dim',
            default=bittensor.__network_dim__,
            type=int,
            help='Dimensionality of the encoder layers and the pooler layer.')
        parser.add_argument(
            '--synapse.n_layers',
            default=12,
            type=int,
            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument(
            '--synapse.n_heads',
            default=16,
            type=int,
            help=
            'Number of attention heads for each attention layer in the Transformer encoder.'
        )
        parser.add_argument(
            '--synapse.dropout',
            default=0.1,
            type=float,
            help=
            'The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.'
        )
        parser.add_argument(
            '--synapse.attention_dropout',
            default=0.1,
            type=float,
            help='The dropout probability for the attention mechanism.')
        parser.add_argument(
            '--synapse.gelu_activation',
            default=True,
            type=bool,
            help=
            'Whether or not to use gelu for the activations instead of relu.')
        parser.add_argument(
            '--synapse.sinusoidal_embeddings',
            default=False,
            type=bool,
            help=
            'Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.'
        )
        parser.add_argument(
            '--synapse.causal',
            default=False,
            type=bool,
            help=
            'Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead of a bidirectional context.'
        )
        parser.add_argument(
            '--synapse.asm',
            default=False,
            type=bool,
            help=
            'Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer.'
        )
        parser.add_argument(
            '--synapse.n_langs',
            default=1,
            type=int,
            help=
            'The number of languages the model handles. Set to 1 for monolingual models.'
        )
        parser.add_argument(
            '--synapse.use_lang_emb',
            default=True,
            type=bool,
            help=
            'Whether to use language embeddings. Some models use additional language embeddings, see the multilingual models page for information on how to use them.'
        )
        parser.add_argument(
            '--synapse.max_position_embeddings',
            default=512,
            type=int,
            help=
            'The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).'
        )
        parser.add_argument(
            '--synapse.embed_init_std',
            default=pow(2048, -0.5),
            type=float,
            help=
            'The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.'
        )
        parser.add_argument(
            '--synapse.init_std',
            default=0.02,
            type=float,
            help=
            'The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices.'
        )
        parser.add_argument(
            '--synapse.layer_norm_eps',
            default=1e-12,
            type=float,
            help='The epsilon used by the layer normalization layers.')
        parser.add_argument(
            '--synapse.bos_index',
            default=0,
            type=int,
            help=
            'The index of the beginning of sentence token in the vocabulary.')
        parser.add_argument(
            '--synapse.eos_index',
            default=1,
            type=int,
            help='The index of the end of sentence token in the vocabulary.')
        parser.add_argument(
            '--synapse.pad_index',
            default=2,
            type=int,
            help='The index of the padding token in the vocabulary.')
        parser.add_argument(
            '--synapse.unk_index',
            default=3,
            type=int,
            help='The index of the unknown token in the vocabulary.')
        parser.add_argument(
            '--synapse.mask_index',
            default=5,
            type=int,
            help='The index of the masking token in the vocabulary.')
        parser.add_argument(
            '--synapse.is_encoder',
            default=True,
            type=bool,
            help=
            'Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.'
        )
        parser.add_argument(
            '--synapse.summary_type',
            default="first",
            type=str,
            help=
            'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.'
        )
        parser.add_argument(
            '--synapse.summary_use_proj',
            default=True,
            type=bool,
            help=
            'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. Whether or not to add a projection after the vector extraction.'
        )
        parser.add_argument(
            '--synapse.summary_activation',
            type=str,
            help=
            'Pass "tanh" for a tanh activation to the output, any other value will result in no activation.'
        )
        parser.add_argument(
            '--synapse.summary_proj_to_labels',
            default=True,
            type=bool,
            help=
            'Whether the projection outputs should have config.num_labels or config.hidden_size classes.'
        )
        parser.add_argument(
            '--synapse.summary_first_dropout',
            default=0.1,
            type=float,
            help=
            'The dropout ratio to be used after the projection and activation.'
        )
        parser.add_argument('--synapse.start_n_top',
                            default=5,
                            type=int,
                            help='Used in the SQuAD evaluation script.')
        parser.add_argument('--synapse.end_n_top',
                            default=5,
                            type=int,
                            help='Used in the SQuAD evaluation script.')
        parser.add_argument(
            '--synapse.mask_token_id',
            default=0,
            type=int,
            help=
            'Model agnostic parameter to identify masked tokens when generating text in an MLM context.'
        )
        parser.add_argument(
            '--synapse.lang_id',
            default=1,
            type=int,
            help=
            'The ID of the language used by the model. This parameter is used when generating text in a given language.'
        )
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.synapse.n_layers > 0, "Number of hidden layers in the Transformer encoder must be > 0"
        assert config.synapse.n_heads > 0, "Number of attention heads for each attention layer in the Transformer encoder must be > 0"

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the XLM Synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of tokenized sentences.
            
            Returns:
                hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): 
                    Hidden layer representation produced using the local_context.
        """
        hidden = self.local_forward(inputs=inputs.to(self.device),
                                    training=False).local_hidden
        return hidden

    def local_forward(self,
                      inputs: torch.LongTensor,
                      training: bool = True) -> SimpleNamespace:
        """ Forward pass through XLM synapse.

            Args:
                inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

                training (:obj:`bool`, `optional`, defaults to True):
                    Switch to True if this forward pass computes a CLM loss.

            SimpleNamespace {
                    local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer context.

                    local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                        Hidden layer encoding produced using local_context.

                    local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                        XLM CLM Target predictions produced using local_context. 

                    local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        XLM CLM loss using local_context.
                }
        """

        # return variables to be filled.
        output = SimpleNamespace()

        # local_context: distilled version of remote context.
        # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_context = self.xlm(input_ids=inputs,
                                        return_dict=True).last_hidden_state

        # local_hidden: hidden layer encoding of sequence with local_context.
        # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_hidden = self.hidden_layer(output.local_context)

        if training:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.local_target = self.target_layer(output.local_hidden)

            # local_target_loss: XLM loss between local_target and ground truth targets (passed targets)
            shift_logits = output.local_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.local_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron,
                       inputs: torch.LongTensor,
                       training: bool) -> SimpleNamespace:
        """ Forward pass inputs and labels through the XLM module.


        Args:
            neuron (:obj: `bittensor.neuron.Neuron`, `required`):
                    Bittensor neuron, used for making queries to the remote network.

            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): 
                    Batch_size length list of text sentences.

            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            self.local_forward() + SimpleNamespace ( 

                    remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`): 
                        Hidden layer encoding produced using the remote_context.

                    remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,  bittensor.__vocab_size__)`, `optional`):
                        XLM CLM Target predictions using the remote_context.

                    remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                        XLM CLM loss using the remote_context.

                    distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): 
                        Distillation loss between local_context and remote_context.

                    router (:obj:`SimpleNamespace`, `required`): 
                        Outputs from the pkm dendrite.
            )
        """
        # Filter out of range tokens
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)

        # Run local model
        # output = SimpleNamespace
        output = self.local_forward(inputs, training)

        # pooled: pooled hidden layer from local run, used as our query context.
        # pooled.shape = [batch_size, bittensor.__network_dim__]
        pooled = self.pooler(output.local_hidden.detach())

        # remote_context: joined responses from a dendrite.forward_text call.
        # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron,
                                                 inputs.to(self.device),
                                                 pooled)
        remote_context = output.router.response

        # Distillation loss: distillation loss between local_context and remote_context
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context,
                                              remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_length, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(remote_context)

        if training:
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.remote_target = self.target_layer(output.remote_hidden)

            # remote_target_loss: CLM loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            shift_logits = output.remote_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.remote_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))

        return output
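
All of these synapses query the network through a pooled vector. The BertPooler / GPT2Pooler / XLMPooler modules referenced above are assumed here to follow the usual huggingface pattern (a dense layer plus tanh over the first token's hidden state); the sketch below is a generic first-token pooler under that assumption, not the exact module used by the examples:

import torch
import torch.nn as nn

class FirstTokenPooler(nn.Module):
    """Generic pooler: dense + tanh over the first token's hidden state."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        first_token = hidden_states[:, 0]                # [batch_size, hidden_size]
        return self.activation(self.dense(first_token))  # [batch_size, hidden_size]

pooled = FirstTokenPooler(512)(torch.randn(2, 16, 512))
print(pooled.shape)   # torch.Size([2, 512])
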