def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4, **kwargs):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    layers = []
    layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
    for x in range(self.num_layers):
        layers.append(
            LayerSpec(torch.nn.Linear, self.hidden_dim, self.hidden_dim, bias=False))
        layers.append(lambda x: x)
    layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     **kwargs)
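# These pipeline constructors all follow the same pattern: collect LayerSpec
# entries and hand them to the PipelineModule base class. A minimal sketch,
# assuming DeepSpeed's deepspeed.pipe API: LayerSpec only records a class and
# its constructor arguments, and the module is built lazily (via build()) on
# whichever pipeline stage ends up owning it, so the full model is never
# materialized in a single process. Plain callables such as `lambda x: x` can
# be mixed into the layer list as stateless stages.
import torch
from deepspeed.pipe import LayerSpec

spec = LayerSpec(torch.nn.Linear, 128, 128, bias=False)  # nothing allocated yet
layer = spec.build()                                      # the nn.Linear is constructed only here
print(type(layer).__name__)                               # -> "Linear"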
def __init__(self,
             *,
             num_tokens,
             dim,
             seq_len,
             depth,
             loss_fn,
             heads=8,
             dim_head=64,
             attn_dropout=0.,
             ff_dropout=0.,
             sparse_attn=False,
             use_fused_layernorm=False,
             tie_classifier_weights=False,
             num_stages=2,
             **kwargs):
    if not use_fused_layernorm:
        norm_class = nn.LayerNorm
    else:
        from apex.normalization import FusedLayerNorm
        norm_class = FusedLayerNorm

    self.seq_len = seq_len
    layers_sparse_attn = cast_tuple(sparse_attn, depth)

    # Build spec list

    # Input embedding
    spec = [
        LayerSpec(EmbedBlock, num_tokens=num_tokens, dim=dim, seq_len=seq_len)
    ]

    # Transformer layers
    for i in range(depth):
        spec.append(
            LayerSpec(TransformerBlock,
                      dim=dim,
                      seq_len=seq_len,
                      heads=heads,
                      dim_head=dim_head,
                      attn_dropout=attn_dropout,
                      ff_dropout=ff_dropout,
                      sparse_attn=layers_sparse_attn[i],
                      norm_class=norm_class))

    # Output norm and Linear
    spec += [
        LayerSpec(norm_class, dim),
        LayerSpec(nn.Linear, dim, num_tokens),
        lambda x: x.transpose(1, 2)
    ]
    print(spec)

    assert len(spec) % num_stages == 0, (
        f"for optimal performance, depth + 4 ({len(spec)}) should be divisible "
        f"by the number of pipeline stages ({num_stages})")

    super().__init__(layers=spec,
                     loss_fn=loss_fn,
                     num_stages=num_stages,
                     **kwargs)
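# Note: cast_tuple is referenced above but not defined in this snippet. A
# minimal sketch of the usual helper (an assumption, not the original code):
# it broadcasts a scalar sparse_attn flag to one entry per transformer layer,
# while passing an explicit per-layer tuple through unchanged.
def cast_tuple(val, depth):
    return val if isinstance(val, tuple) else (val,) * depth

assert cast_tuple(False, 3) == (False, False, False)
assert cast_tuple((True, False), 2) == (True, False)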
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 to avoid a word-embedding size
    # change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size),
        '--make-vocab-size-divisible-by',
        str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # Hardcode the attention mask for testing; pipeline parallelism
            # requires the attn_mask to be stashed.
            attention_mask = torch.tensor([[True]],
                                          device=torch.cuda.current_device())
            return super().forward(args, attention_mask)

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': 'tests/unit/gpt2-vocab.json',
        'merge_file': 'tests/unit/gpt2-merges.txt',
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 to avoid a word-embedding size
    # change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size',
        str(mp_size),
        '--make-vocab-size-divisible-by',
        str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # Unpack (hidden_states, attention_mask) and return the mask again
            # so the next pipeline stage receives it as well.
            hidden_states, attention_mask = args[0], args[1]
            return super().forward(*args), attention_mask

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
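# The two test wrappers above show the tuple-passing convention that pipeline
# stages impose: each stage receives a single object (activations plus the
# attention mask) and must return everything the next stage needs. A hedged
# sketch of that pattern with a hypothetical wrapper module, for illustration
# only:
import torch.nn as nn

class MaskPassThroughLayer(nn.Module):
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, args):
        hidden_states, attention_mask = args
        output = self.inner(hidden_states)  # a real layer would also consume the mask
        return output, attention_mask       # re-stash the mask for the next stage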
def __init__(self, num_tokentypes=0, parallel_output=True, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(args.init_method_std,
                                                              args.num_layers)
    weight_tying = not args.no_weight_tying
    if args.pos_emb == 'rpe':
        rpe_emb = ParallelRelativePositionBias(causal=True,
                                               num_buckets=args.rpe_num_buckets,
                                               max_distance=args.rpe_max_distance,
                                               heads=args.num_attention_heads)
    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy

    #
    # forward() prototype
    #
    self.specs = []

    # Embedding layer
    if weight_tying:
        self.specs.append(TiedLayerSpec('embed',
                                        EmbeddingPipe,
                                        self.hidden_size,
                                        args.padded_vocab_size,
                                        args.max_position_embeddings,
                                        args.hidden_dropout,
                                        self.init_method,
                                        self.num_tokentypes,
                                        tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(LayerSpec(EmbeddingPipe,
                                    self.hidden_size,
                                    args.padded_vocab_size,
                                    args.max_position_embeddings,
                                    args.hidden_dropout,
                                    self.init_method,
                                    self.num_tokentypes))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), *x[1:]))

    # Transformer layers
    for x in range(args.num_layers):
        if args.sparsity == 'none':
            sparse = False
        elif args.sparsity == 'all':
            sparse = True
        elif args.sparsity == 'interspersed':
            sparse = not x % 2 == 0
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x,
                      sparse=sparse,
                      rpe=rpe_emb if args.pos_emb == 'rpe' else None,
                      rotary=args.pos_emb == 'rotary'))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    if args.norm == "rmsnorm":
        norm = RMSNorm
        eps = args.rms_norm_epsilon
    elif args.norm == "layernorm":
        eps = args.layernorm_epsilon
        norm = LayerNorm
    elif args.norm == "scalenorm":
        eps = args.scalenorm_epsilon
        norm = ScaleNorm
    self.specs.append(LayerSpec(norm, args.hidden_size, eps=eps))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.hidden_size,
                          args.padded_vocab_size,
                          args.max_position_embeddings,
                          args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        # TODO: not sure whether to use RowParallelLinear's default scatter to mp region
        # here, or copy, which is the default of parallel_lm_logits. Should investigate
        # the benefits of both.
        self.specs.append(
            LayerSpec(mpu.RowParallelLinear,
                      args.hidden_size,
                      args.padded_vocab_size,
                      bias=False,
                      input_is_parallel=False,
                      parallel_output=self.parallel_output,
                      skip_bias_add=False))
        self.specs.append(lambda x: x[0])  # drop bias

    loss_fn = partial(cross_entropy, _fp16=self.fp16_lm_cross_entropy)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=loss_fn,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method=args.pipe_partition_method)  # 'type:transformer' / 'parameters'
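# Both branches above end by projecting hidden states back to vocabulary
# logits; the weight-tied branch does so by registering the embedding a second
# time under the same TiedLayerSpec key and overriding its forward with
# _logits_helper. A minimal sketch of that tying pattern, assuming DeepSpeed's
# TiedLayerSpec and using a hypothetical ToyEmbedding in place of EmbeddingPipe
# (building these specs into a PipelineModule would additionally require an
# initialized distributed environment, so the sketch stops at spec creation):
import torch.nn as nn
from deepspeed.pipe import TiedLayerSpec

class ToyEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.word_embeddings_weight = self.word_embeddings.weight

    def forward(self, tokens):
        return self.word_embeddings(tokens)

def _logits_helper(embedding, lm_output):
    # Reuse the tied embedding matrix as the output projection.
    return lm_output @ embedding.word_embeddings_weight.t()

specs = [
    TiedLayerSpec('embed', ToyEmbedding, 1000, 64,
                  tied_weight_attr='word_embeddings_weight'),
    # ... transformer LayerSpecs would go here ...
    TiedLayerSpec('embed', ToyEmbedding, 1000, 64,
                  forward_fn=_logits_helper,
                  tied_weight_attr='word_embeddings_weight'),
]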
def __init__(self, num_tokentypes=0, parallel_output=True, add_pooler=False, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(
        args.init_method_std, args.num_layers)
    self.add_pooler = add_pooler
    if self.add_pooler:
        raise NotImplementedError(
            'Pipeline pooler not yet implemented. Forward needs pooling_sequence_index'
        )

    # Use torch gelu unless otherwise forced.
    gelu = F.gelu
    if args.openai_gelu:
        gelu = openai_gelu

    #
    # forward() prototype
    #
    self.specs = []

    # Embedding layer
    self.specs.append(
        TiedLayerSpec('embed',
                      EmbeddingPipe,
                      self.hidden_size,
                      args.padded_vocab_size,
                      args.max_position_embeddings,
                      args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes,
                      tied_weight_attr='word_embeddings_weight'))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), x[1]))

    # Transformer layers
    for x in range(args.num_layers):
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    self.specs.append(
        LayerSpec(LayerNorm, args.hidden_size, eps=args.layernorm_epsilon))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    self.specs.append(
        TiedLayerSpec('embed',
                      EmbeddingPipe,
                      self.hidden_size,
                      args.padded_vocab_size,
                      args.max_position_embeddings,
                      args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes,
                      forward_fn=_logits_helper,
                      tied_weight_attr='word_embeddings_weight'))

    # Should maybe be done in loss_fn() instead?
    if args.fp16:
        self.specs.append(fp16.fp16_to_fp32)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=CrossEntropy,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method='type:transformer')
def __init__(self, num_tokentypes=0, parallel_output=True, topology=None):
    args = get_args()

    self.parallel_output = parallel_output
    self.hidden_size = args.hidden_size
    self.num_tokentypes = num_tokentypes
    self.init_method = init_method_normal(args.init_method_std)
    self.output_layer_init_method = scaled_init_method_normal(args.init_method_std,
                                                              args.num_layers)

    # Use torch gelu unless otherwise forced.
    gelu = F.gelu
    if args.openai_gelu:
        gelu = openai_gelu

    #
    # forward() prototype
    #
    self.specs = []
    weight_tying = not args.no_weight_tying

    # Embedding layer
    if weight_tying:
        self.specs.append(TiedLayerSpec('embed',
                                        EmbeddingPipe,
                                        self.hidden_size,
                                        args.padded_vocab_size,
                                        args.max_position_embeddings,
                                        args.hidden_dropout,
                                        self.init_method,
                                        self.num_tokentypes,
                                        args.sinusoidal_pos_emb,
                                        tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(LayerSpec(EmbeddingPipe,
                                    self.hidden_size,
                                    args.padded_vocab_size,
                                    args.max_position_embeddings,
                                    args.hidden_dropout,
                                    self.init_method,
                                    self.num_tokentypes,
                                    args.sinusoidal_pos_emb))

    # outputs are now (hidden_states, attention_mask)

    # data format change to avoid explicit transposes: [b s h] --> [s b h]
    self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), x[1]))

    # Transformer layers
    for x in range(args.num_layers):
        if args.sparsity == 'none':
            sparse = False
        elif args.sparsity == 'all':
            sparse = True
        elif args.sparsity == 'interspersed':
            sparse = not x % 2 == 0
        self.specs.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      attention_mask_func=gpt2_attention_mask_func,
                      init_method=self.init_method,
                      output_layer_init_method=self.output_layer_init_method,
                      layer_number=x,
                      sparse=sparse))

    # Undo data format change and drop mask
    self.specs.append(lambda x: x[0].transpose(0, 1).contiguous())

    # Final layernorm after transformer layers
    self.specs.append(
        LayerSpec(LayerNorm, args.hidden_size, eps=args.layernorm_epsilon))

    # XXX forward_method_parallel_output is assumed to be None, but we're not in a
    # fwd method to assert

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        return parallel_lm_logits(lm_output,
                                  embedding.word_embeddings_weight,
                                  self.parallel_output)

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.hidden_size,
                          args.padded_vocab_size,
                          args.max_position_embeddings,
                          args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          args.sinusoidal_pos_emb,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(mpu.RowParallelLinear,
                      args.hidden_size,
                      args.padded_vocab_size,
                      bias=False,
                      input_is_parallel=False,
                      parallel_output=True,
                      skip_bias_add=False))
        self.specs.append(lambda x: x[0])  # drop bias

    # Should maybe be done in loss_fn() instead?
    if args.fp16:
        self.specs.append(fp16.fp16_to_fp32)

    if args.checkpoint_activations:
        interval = args.checkpoint_num_layers
    else:
        interval = 0

    super().__init__(layers=self.specs,
                     loss_fn=CrossEntropy,
                     topology=topology,
                     activation_checkpoint_interval=interval,
                     partition_method='type:transformer')
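# The sparsity branch repeated in the GPT-2 pipe variants above maps
# args.sparsity to a per-layer flag. Pulled out as a standalone helper (a
# rewrite for illustration, not code from the source), the schedule is:
def layer_is_sparse(sparsity: str, layer_number: int) -> bool:
    if sparsity == 'none':
        return False
    if sparsity == 'all':
        return True
    if sparsity == 'interspersed':
        return not layer_number % 2 == 0  # every odd-numbered layer is sparse
    raise ValueError(f"unknown sparsity setting: {sparsity!r}")

assert [layer_is_sparse('interspersed', i) for i in range(4)] == [False, True, False, True]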
def init_specs(self):

    weight_tying = not self.neox_args.no_weight_tying
    self.specs = []

    # Embedding layer
    # input will be (input_ids, position_ids, attention_mask)
    if weight_tying:
        self.specs.append(
            TiedLayerSpec(
                "embed",
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
                tied_weight_attr="word_embeddings_weight",
            ))
    else:
        self.specs.append(
            LayerSpec(
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
            ))

    # NB: the attention mask always needs to be the *last* item in the args when being
    # passed from one stage to the next, because deepspeed is hacks on top of hacks.
    #
    # outputs are now (hidden_states, attention_mask)
    self.specs.append(_pre_transformer_block)

    # T5 RPE positional embedding
    if self.neox_args.pos_emb == "rpe":
        hidden_size_per_attention_head = mpu.divide(
            self.neox_args.hidden_size, self.neox_args.num_attention_heads)
        rpe_scale = math.sqrt(hidden_size_per_attention_head)
        rpe_emb = ParallelRelativePositionBias(
            neox_args=self.neox_args,
            scale=rpe_scale,
            causal=True,
            num_buckets=self.neox_args.rpe_num_buckets,
            max_distance=self.neox_args.rpe_max_distance,
            heads=self.neox_args.num_attention_heads,
        )

    # Transformer layers
    for i in range(self.neox_args.num_layers):
        layer_type = self.neox_args.attention_config[i]
        if layer_type in ["gmlp", "amlp"]:
            self.specs.append(
                LayerSpec(
                    GMLPBlock,
                    init_method=self.init_method,
                    layer_number=i,
                    output_layer_init_method=self.output_layer_init_method,
                    neox_args=self.neox_args,
                    mask_fn=gpt2_attention_mask_func,
                ))
        else:
            self.specs.append(
                LayerSpec(
                    ParallelTransformerLayerPipe,
                    neox_args=self.neox_args,
                    attention_mask_func=gpt2_attention_mask_func,
                    init_method=self.init_method,
                    output_layer_init_method=self.output_layer_init_method,
                    layer_number=i,
                    rpe=rpe_emb if self.neox_args.pos_emb == "rpe" else None,
                    rotary=self.neox_args.pos_emb == "rotary",
                    use_cache=self.use_cache,
                ))

    # used to drop attention mask + reshape hidden states
    self.specs.append(_post_transformer_block)

    # NormPipe is a (deprecated) helper class that used to be used to pass presents
    # along the pipeline - since presents are now cached to the `TransformerLayer`
    # class this is no longer needed
    norm, eps = get_norm(self.neox_args)
    self.specs.append(
        LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps))

    # outputs are now a single tensor: hidden_states

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        logits = parallel_lm_logits(lm_output,
                                    embedding.word_embeddings_weight,
                                    self.parallel_output)
        return logits

    if weight_tying:
        self.specs.append(
            TiedLayerSpec(
                "embed",
                EmbeddingPipe,
                self.neox_args,
                self.hidden_size,
                self.neox_args.padded_vocab_size,
                self.neox_args.max_position_embeddings,
                self.neox_args.hidden_dropout,
                self.init_method,
                self.num_tokentypes,
                forward_fn=_logits_helper,
                tied_weight_attr="word_embeddings_weight",
            ))
    else:
        self.specs.append(
            LayerSpec(
                ParallelLinearPipe,
                neox_args=self.neox_args,
                init_method=self.init_method,
                parallel_output=self.parallel_output,
            ))
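# _pre_transformer_block and _post_transformer_block are referenced above but
# not shown. An inferred sketch of what they do, based on the inline lambdas in
# the earlier variants and the surrounding comments (the attention mask rides
# along as the last element; hidden states are transposed to [seq, batch,
# hidden] for the transformer stack and back afterwards). The actual GPT-NeoX
# helpers may differ in details such as shape assertions:
def _pre_transformer_block(args):
    hidden_states, attention_mask = args
    # [batch, seq, hidden] -> [seq, batch, hidden]; keep the mask last.
    return hidden_states.transpose(0, 1).contiguous(), attention_mask

def _post_transformer_block(args):
    hidden_states, _attention_mask = args
    # Drop the mask and undo the transpose: [seq, batch, hidden] -> [batch, seq, hidden].
    return hidden_states.transpose(0, 1).contiguous()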
def init_specs(self):

    weight_tying = not self.neox_args.no_weight_tying
    if self.embedding_type == 'rpe':
        rpe_emb = ParallelRelativePositionBias(
            neox_args=self.neox_args,
            causal=True,
            num_buckets=self.neox_args.rpe_num_buckets,
            max_distance=self.neox_args.rpe_max_distance,
            heads=self.neox_args.num_attention_heads)

    self.specs = []

    # Embedding layer
    # input will be (input_ids, position_ids, attention_mask) in Training
    # and (input_ids, position_ids, attention_mask, layer_past) in Inference
    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.neox_args,
                          self.hidden_size,
                          self.neox_args.padded_vocab_size,
                          self.neox_args.max_position_embeddings,
                          self.neox_args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(EmbeddingPipe,
                      self.neox_args,
                      self.hidden_size,
                      self.neox_args.padded_vocab_size,
                      self.neox_args.max_position_embeddings,
                      self.neox_args.hidden_dropout,
                      self.init_method,
                      self.num_tokentypes))

    # NB: in inference, the attention mask always needs to be the *last* item in the args
    # when being passed from one stage to the next, because deepspeed is hacks on top of hacks.
    #
    # outputs are now
    #   Train:     (hidden_states, attention_mask)
    #   Inference: (hidden_states, layer_past, attention_mask)
    self.specs.append(_pre_transformer_block)

    # Transformer layers
    for i in range(self.neox_args.num_layers):
        layer_type = self.neox_args.attention_config[i]
        if layer_type in ["gmlp", "amlp"]:
            self.specs.append(
                LayerSpec(
                    GMLPBlock,
                    init_method=self.init_method,
                    layer_number=i,
                    output_layer_init_method=self.output_layer_init_method,
                    neox_args=self.neox_args,
                    mask_fn=gpt2_attention_mask_func))
        else:
            self.specs.append(
                LayerSpec(
                    ParallelTransformerLayerPipe,
                    neox_args=self.neox_args,
                    attention_mask_func=gpt2_attention_mask_func,
                    init_method=self.init_method,
                    output_layer_init_method=self.output_layer_init_method,
                    layer_number=i,
                    rpe=rpe_emb if self.neox_args.pos_emb == 'rpe' else None,
                    rotary=self.neox_args.pos_emb == 'rotary',
                    get_key_value=self.get_key_value))

    self.specs.append(_post_transformer_block)

    # NormPipe is a helper class to pass presents through to the output when doing inference
    norm, eps = get_norm(self.neox_args)
    self.specs.append(
        LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps))

    # outputs are now
    #   Train:     hidden_states
    #   Inference: (hidden_states, presents)

    def _logits_helper(embedding, lm_output):
        """Just a wrapper to massage inputs/outputs from pipeline."""
        if self._inference and len(lm_output) == 2:
            hidden_states, presents = lm_output
            logits = parallel_lm_logits(hidden_states,
                                        embedding.word_embeddings_weight,
                                        self.parallel_output)
            return logits, presents
        else:
            logits = parallel_lm_logits(lm_output,
                                        embedding.word_embeddings_weight,
                                        self.parallel_output)
            return logits

    if weight_tying:
        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          self.neox_args,
                          self.hidden_size,
                          self.neox_args.padded_vocab_size,
                          self.neox_args.max_position_embeddings,
                          self.neox_args.hidden_dropout,
                          self.init_method,
                          self.num_tokentypes,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight'))
    else:
        self.specs.append(
            LayerSpec(ParallelLinearPipe,
                      neox_args=self.neox_args,
                      init_method=self.init_method,
                      parallel_output=self.parallel_output))