def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4, **kwargs):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    layers = []
    layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
    for x in range(self.num_layers):
        layers.append(
            LayerSpec(torch.nn.Linear, self.hidden_dim, self.hidden_dim, bias=False))
        layers.append(lambda x: x)
    layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     **kwargs)
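# These pipeline constructors all follow the same pattern: collect LayerSpec
# entries and hand them to the PipelineModule base class. A minimal sketch,
# assuming DeepSpeed's deepspeed.pipe API: LayerSpec only records a class and
# its constructor arguments, and the module is built lazily (via build()) on
# whichever pipeline stage ends up owning it, so the full model is never
# materialized in a single process. Plain callables such as `lambda x: x` can
# be mixed into the layer list as stateless stages.
import torch
from deepspeed.pipe import LayerSpec

spec = LayerSpec(torch.nn.Linear, 128, 128, bias=False)  # nothing allocated yet
layer = spec.build()                                      # the nn.Linear is constructed only here
print(type(layer).__name__)                               # -> "Linear"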
def __init__(self,
             *,
             num_tokens,
             dim,
             seq_len,
             depth,
             loss_fn,
             heads=8,
             dim_head=64,
             attn_dropout=0.,
             ff_dropout=0.,
             sparse_attn=False,
             use_fused_layernorm=False,
             tie_classifier_weights=False,
             num_stages=2,
             **kwargs):
    if not use_fused_layernorm:
        norm_class = nn.LayerNorm
    else:
        from apex.normalization import FusedLayerNorm
        norm_class = FusedLayerNorm

    self.seq_len = seq_len
    layers_sparse_attn = cast_tuple(sparse_attn, depth)

    # Build spec list

    # Input embedding
    spec = [
        LayerSpec(EmbedBlock, num_tokens=num_tokens, dim=dim, seq_len=seq_len)
    ]

    # Transformer layers
    for i in range(depth):
        spec.append(
            LayerSpec(TransformerBlock,
                      dim=dim,
                      seq_len=seq_len,
                      heads=heads,
                      dim_head=dim_head,
                      attn_dropout=attn_dropout,
                      ff_dropout=ff_dropout,
                      sparse_attn=layers_sparse_attn[i],
                      norm_class=norm_class))

    # Output norm and Linear
    spec += [
        LayerSpec(norm_class, dim),
        LayerSpec(nn.Linear, dim, num_tokens),
        lambda x: x.transpose(1, 2)
    ]
    print(spec)

    assert len(spec) % num_stages == 0, (
        f"for optimal performance, depth + 4 ({len(spec)}) should be divisible "
        f"by the number of pipeline stages ({num_stages})")

    super().__init__(layers=spec,
                     loss_fn=loss_fn,
                     num_stages=num_stages,
                     **kwargs)
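# Note: cast_tuple is referenced above but not defined in this snippet. A
# minimal sketch of the usual helper (an assumption, not the original code):
# it broadcasts a scalar sparse_attn flag to one entry per transformer layer,
# while passing an explicit per-layer tuple through unchanged.
def cast_tuple(val, depth):
    return val if isinstance(val, tuple) else (val,) * depth

assert cast_tuple(False, 3) == (False, False, False)
assert cast_tuple((True, False), 2) == (True, False)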
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 to avoid a word-embedding size
    # change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size),
        '--make-vocab-size-divisible-by',
        str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # Hardcode the attention mask for testing; pipeline parallelism
            # requires the attn_mask to be stashed.
            attention_mask = torch.tensor([[True]],
                                          device=torch.cuda.current_device())
            return super().forward(args, attention_mask)

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': 'tests/unit/gpt2-vocab.json',
        'merge_file': 'tests/unit/gpt2-merges.txt',
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 to avoid a word-embedding size
    # change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size),
        '--make-vocab-size-divisible-by',
        str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # Unpack (hidden_states, attention_mask) and return the mask again
            # so the next pipeline stage receives it as well.
            hidden_states, attention_mask = args[0], args[1]
            return super().forward(*args), attention_mask

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
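# The two test wrappers above show the tuple-passing convention that pipeline
# stages impose: each stage receives a single object (activations plus the
# attention mask) and must return everything the next stage needs. A hedged
# sketch of that pattern with a hypothetical wrapper module, for illustration
# only:
import torch.nn as nn

class MaskPassThroughLayer(nn.Module):
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, args):
        hidden_states, attention_mask = args
        output = self.inner(hidden_states)  # a real layer would also consume the mask
        return output, attention_mask       # re-stash the mask for the next stage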
def __init__(self, num_tokentypes=0, parallel_output=True, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(args.init_method_std,
                                                              args.num_layers)
    weight_tying = not args.no_weight_tying
    if args.pos_emb == 'rpe':
        rpe_emb = ParallelRelativePositionBias(causal=True,
                                               num_buckets=args.rpe_num_buckets,
                                               max_distance=args.rpe_max_distance,
                                               heads=args.num_attention_heads)
    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy

    #
    # forward() prototype
    #
    self.specs = []

    # Embedding layer
    if weight_tying:
        self.specs.append(TiedLayerSpec('embed',
                                        EmbeddingPipe,
                                        self.hidden_size,
                                        args.padded_vocab_size,
                                        args.max_position_embeddings,
                                        args.hidden_dropout,
                                        self.init_method,
                                        self.num_tokentypes,
                                        tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(LayerSpec(EmbeddingPipe,
                                    self.hidden_size,
                                    args.padded_vocab_size,
                                    args.max_position_embeddings,
                                    args.hidden_dropout,
                                    self.init_method,
                                    self.num_tokentypes))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), *x[1:]))

    # Transformer layers
    for x in range(args.num_layers):
        if args.sparsity == 'none':
            sparse = False
        elif args.sparsity == 'all':
            sparse = True
        elif args.sparsity == 'interspersed':
            sparse = not x % 2 == 0
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x,
                      sparse=sparse,
                      rpe=rpe_emb if args.pos_emb == 'rpe' else None,
                      rotary=args.pos_emb == 'rotary'))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    if args.norm == "rmsnorm":
        norm = RMSNorm
        eps = args.rms_norm_epsilon
    elif args.norm == "layernorm":
        eps = args.layernorm_epsilon
        norm = LayerNorm
    elif args.norm == "scalenorm":
        eps = args.scalenorm_epsilon
        norm = ScaleNorm
    self.specs.append(LayerSpec(norm, args.hidden_size, eps=eps))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.hidden_size,
                          args.padded_vocab_size,
                          args.max_position_embeddings,
                          args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        # TODO: not sure whether to use RowParallelLinear's default scatter to mp region
        # here, or copy, which is the default of parallel_lm_logits. Should investigate
        # the benefits of both.
        self.specs.append(
            LayerSpec(mpu.RowParallelLinear,
                      args.hidden_size,
                      args.padded_vocab_size,
                      bias=False,
                      input_is_parallel=False,
                      parallel_output=self.parallel_output,
                      skip_bias_add=False))
        self.specs.append(lambda x: x[0])  # drop bias

    loss_fn = partial(cross_entropy, _fp16=self.fp16_lm_cross_entropy)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=loss_fn,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method=args.pipe_partition_method)  # 'type:transformer' / 'parameters'
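# Both branches above end by projecting hidden states back to vocabulary
# logits; the weight-tied branch does so by registering the embedding a second
# time under the same TiedLayerSpec key and overriding its forward with
# _logits_helper. A minimal sketch of that tying pattern, assuming DeepSpeed's
# TiedLayerSpec and using a hypothetical ToyEmbedding in place of EmbeddingPipe
# (building these specs into a PipelineModule would additionally require an
# initialized distributed environment, so the sketch stops at spec creation):
import torch.nn as nn
from deepspeed.pipe import TiedLayerSpec

class ToyEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.word_embeddings_weight = self.word_embeddings.weight

    def forward(self, tokens):
        return self.word_embeddings(tokens)

def _logits_helper(embedding, lm_output):
    # Reuse the tied embedding matrix as the output projection.
    return lm_output @ embedding.word_embeddings_weight.t()

specs = [
    TiedLayerSpec('embed', ToyEmbedding, 1000, 64,
                  tied_weight_attr='word_embeddings_weight'),
    # ... transformer LayerSpecs would go here ...
    TiedLayerSpec('embed', ToyEmbedding, 1000, 64,
                  forward_fn=_logits_helper,
                  tied_weight_attr='word_embeddings_weight'),
]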
def __init__(self, num_tokentypes=0, parallel_output=True, add_pooler=False, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(
        args.init_method_std, args.num_layers)
    self.add_pooler = add_pooler
    if self.add_pooler:
        raise NotImplementedError(
            'Pipeline pooler not yet implemented. Forward needs pooling_sequence_index'
        )

    # Use torch gelu unless otherwise forced.
    gelu = F.gelu
    if args.openai_gelu:
        gelu = openai_gelu

    #
    # forward() prototype
    #
    self.specs = []

    # Embedding layer
    self.specs.append(
        TiedLayerSpec('embed',
                      EmbeddingPipe,
                      self.hidden_size,
                      args.padded_vocab_size,
                      args.max_position_embeddings,
                      args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes,
                      tied_weight_attr='word_embeddings_weight'))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), x[1]))

    # Transformer layers
    for x in range(args.num_layers):
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    self.specs.append(
        LayerSpec(LayerNorm, args.hidden_size, eps=args.layernorm_epsilon))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    self.specs.append(
        TiedLayerSpec('embed',
                      EmbeddingPipe,
                      self.hidden_size,
                      args.padded_vocab_size,
                      args.max_position_embeddings,
                      args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes,
                      forward_fn=_logits_helper,
                      tied_weight_attr='word_embeddings_weight'))

    # Should maybe be done in loss_fn() instead?
    if args.fp16:
        self.specs.append(fp16.fp16_to_fp32)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=CrossEntropy,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method='type:transformer')
def __init__(self, num_tokentypes=0, parallel_output=True, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(args.init_method_std,
                                                              args.num_layers)

    # Use torch gelu unless otherwise forced.
    gelu = F.gelu
    if args.openai_gelu:
        gelu = openai_gelu

    #
    # forward() prototype
    #
    self.specs = []
    weight_tying = not args.no_weight_tying

    # Embedding layer
    if weight_tying:
        self.specs.append(TiedLayerSpec('embed',
                                        EmbeddingPipe,
                                        self.hidden_size,
                                        args.padded_vocab_size,
                                        args.max_position_embeddings,
                                        args.hidden_dropout,
                                        self.init_method,
                                        self.num_tokentypes,
                                        args.sinusoidal_pos_emb,
                                        tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(LayerSpec(EmbeddingPipe,
                                    self.hidden_size,
                                    args.padded_vocab_size,
                                    args.max_position_embeddings,
                                    args.hidden_dropout,
                                    self.init_method,
                                    self.num_tokentypes,
                                    args.sinusoidal_pos_emb))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), x[1]))

    # Transformer layers
    for x in range(args.num_layers):
        if args.sparsity == 'none':
            sparse = False
        elif args.sparsity == 'all':
            sparse = True
        elif args.sparsity == 'interspersed':
            sparse = not x % 2 == 0
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x,
                      sparse=sparse))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    self.specs.append(
        LayerSpec(LayerNorm, args.hidden_size, eps=args.layernorm_epsilon))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.hidden_size,
                          args.padded_vocab_size,
                          args.max_position_embeddings,
                          args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          args.sinusoidal_pos_emb,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(mpu.RowParallelLinear,
                      args.hidden_size,
                      args.padded_vocab_size,
                      bias=False,
                      input_is_parallel=False,
                      parallel_output=True,
                      skip_bias_add=False))
        self.specs.append(lambda x: x[0])  # drop bias

    # Should maybe be done in loss_fn() instead?
    if args.fp16:
        self.specs.append(fp16.fp16_to_fp32)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=CrossEntropy,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method='type:transformer')
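# The sparsity branch repeated in the GPT-2 pipe variants above maps
# args.sparsity to a per-layer flag. Pulled out as a standalone helper (a
# rewrite for illustration, not code from the source), the schedule is:
def layer_is_sparse(sparsity: str, layer_number: int) -> bool:
    if sparsity == 'none':
        return False
    if sparsity == 'all':
        return True
    if sparsity == 'interspersed':
        return not layer_number % 2 == 0  # every odd-numbered layer is sparse
    raise ValueError(f"unknown sparsity setting: {sparsity!r}")

assert [layer_is_sparse('interspersed', i) for i in range(4)] == [False, True, False, True]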
def init_specs(self):

    weight_tying = not self.neox_args.no_weight_tying
    self.specs = []

    # Embedding layer
    # input will be (input_ids, position_ids, attention_mask)
    if weight_tying:
        self.specs.append(
            TiedLayerSpec(
                "embed",
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
                tied_weight_attr="word_embeddings_weight",
            ))
    else:
        self.specs.append(
            LayerSpec(
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
            ))

    # NB: the attention mask always needs to be the *last* item in the args when being
    # passed from one stage to the next, because deepspeed is hacks on top of hacks.
    #
    # outputs are now (hidden_states, attention_mask)
    self.specs.append(_pre_transformer_block)

    # T5 RPE positional embedding
    if self.neox_args.pos_emb == "rpe":
        hidden_size_per_attention_head = mpu.divide(
            self.neox_args.hidden_size, self.neox_args.num_attention_heads)
        rpe_scale = math.sqrt(hidden_size_per_attention_head)
        rpe_emb = ParallelRelativePositionBias(
            neox_args=self.neox_args,
            scale=rpe_scale,
            causal=True,
            num_buckets=self.neox_args.rpe_num_buckets,
            max_distance=self.neox_args.rpe_max_distance,
            heads=self.neox_args.num_attention_heads,
        )

    # Transformer layers
    for i in range(self.neox_args.num_layers):
        layer_type = self.neox_args.attention_config[i]
        if layer_type in ["gmlp", "amlp"]:
            self.specs.append(
                LayerSpec(
                    GMLPBlock,
                    init_method=self.init_method,
                    layer_number=i,
                    output_layer_init_method=self.output_layer_init_method,
                    neox_args=self.neox_args,
                    mask_fn=gpt2_attention_mask_func,
                ))
        else:
            self.specs.append(
                LayerSpec(
                    ParallelTransformerLayerPipe,
                    neox_args=self.neox_args,
                    attention_mask_func=gpt2_attention_mask_func,
                    init_method=self.init_method,
                    output_layer_init_method=self.output_layer_init_method,
                    layer_number=i,
                    rpe=rpe_emb if self.neox_args.pos_emb == "rpe" else None,
                    rotary=self.neox_args.pos_emb == "rotary",
                    use_cache=self.use_cache,
                ))

    # used to drop attention mask + reshape hidden states
    self.specs.append(_post_transformer_block)

    # NormPipe is a (deprecated) helper class that used to be used to pass presents
    # along the pipeline - since presents are now cached to the `TransformerLayer`
    # class this is no longer needed
    norm, eps = get_norm(self.neox_args)
    self.specs.append(
        LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps))

    # outputs are now a single tensor: hidden_states

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        logits = parallel_lm_logits(lm_output,
                                    embedding.word_embeddings_weight,
                                    self.parallel_output)
        return logits

    if weight_tying:
        self.specs.append(
            TiedLayerSpec(
                "embed",
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
                forward_fn=_logits_helper,
                tied_weight_attr="word_embeddings_weight",
            ))
    else:
        self.specs.append(
            LayerSpec(
                ParallelLinearPipe,
                neox_args=self.neox_args,
                init_method=self.init_method,
                parallel_output=self.parallel_output,
            ))
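# _pre_transformer_block and _post_transformer_block are referenced above but
# not shown. An inferred sketch of what they do, based on the inline lambdas in
# the earlier variants and the surrounding comments (the attention mask rides
# along as the last element; hidden states are transposed to [seq, batch,
# hidden] for the transformer stack and back afterwards). The actual GPT-NeoX
# helpers may differ in details such as shape assertions:
def _pre_transformer_block(args):
    hidden_states, attention_mask = args
    # [batch, seq, hidden] -> [seq, batch, hidden]; keep the mask last.
    return hidden_states.transpose(0, 1).contiguous(), attention_mask

def _post_transformer_block(args):
    hidden_states, _attention_mask = args
    # Drop the mask and undo the transpose: [seq, batch, hidden] -> [batch, seq, hidden].
    return hidden_states.transpose(0, 1).contiguous()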
def init_specs(self):

    weight_tying = not self.neox_args.no_weight_tying
    if self.embedding_type == 'rpe':
        rpe_emb = ParallelRelativePositionBias(
            neox_args=self.neox_args,
            causal=True,
            num_buckets=self.neox_args.rpe_num_buckets,
            max_distance=self.neox_args.rpe_max_distance,
            heads=self.neox_args.num_attention_heads)

    self.specs = []

    # Embedding layer
    # input will be (input_ids, position_ids, attention_mask) in Training
    # and (input_ids, position_ids, attention_mask, layer_past) in Inference
    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.neox_args,
                          self.hidden_size,
                          self.neox_args.padded_vocab_size,
                          self.neox_args.max_position_embeddings,
                          self.neox_args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(EmbeddingPipe,
                      self.neox_args,
                      self.hidden_size,
                      self.neox_args.padded_vocab_size,
                      self.neox_args.max_position_embeddings,
                      self.neox_args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes))

    # NB: in inference, the attention mask always needs to be the *last* item in the args
    # when being passed from one stage to the next, because deepspeed is hacks on top of hacks.
    #
    # outputs are now
    #   Train:     (hidden_states, attention_mask)
    #   Inference: (hidden_states, layer_past, attention_mask)
    self.specs.append(_pre_transformer_block)

    # Transformer layers
    for i in range(self.neox_args.num_layers):
        layer_type = self.neox_args.attention_config[i]
        if layer_type in ["gmlp", "amlp"]:
            self.specs.append(
                LayerSpec(
                    GMLPBlock,
                    init_method=self.init_method,
                    layer_number=i,
                    output_layer_init_method=self.output_layer_init_method,
                    neox_args=self.neox_args,
                    mask_fn=gpt2_attention_mask_func))
        else:
            self.specs.append(
                LayerSpec(
                    ParallelTransformerLayerPipe,
                    neox_args=self.neox_args,
                    attention_mask_func=gpt2_attention_mask_func,
                    init_method=self.init_method,
                    output_layer_init_method=self.output_layer_init_method,
                    layer_number=i,
                    rpe=rpe_emb if self.neox_args.pos_emb == 'rpe' else None,
                    rotary=self.neox_args.pos_emb == 'rotary',
                    get_key_value=self.get_key_value))

    self.specs.append(_post_transformer_block)

    # NormPipe is a helper class to pass presents through to the output when doing inference
    norm, eps = get_norm(self.neox_args)
    self.specs.append(
        LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps))

    # outputs are now
    #   Train:     hidden_states
    #   Inference: (hidden_states, presents)

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        if self._inference and len(lm_output) == 2:
            hidden_states, presents = lm_output
            logits = parallel_lm_logits(hidden_states,
                                        embedding.word_embeddings_weight,
                                        self.parallel_output)
            return logits, presents
        else:
            logits = parallel_lm_logits(lm_output,
                                        embedding.word_embeddings_weight,
                                        self.parallel_output)
            return logits

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.neox_args,
                          self.hidden_size,
                          self.neox_args.padded_vocab_size,
                          self.neox_args.max_position_embeddings,
                          self.neox_args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(ParallelLinearPipe,
                      neox_args=self.neox_args,
                      init_method=self.init_method,
                      parallel_output=self.parallel_output))