def initialize_word_embeddings(self, init_method_normal):
    args = get_args()
    if not self.share_word_embeddings:
        raise Exception('initialize_word_embeddings() was called but '
                        'share_word_embeddings is false')

    # Parameters are shared between the word embeddings layer, and the
    # heads at the end of the model. In a pipelined setup with more than
    # one stage, the initial embedding layer and the head are on different
    # workers, so we do the following:
    # 1. Create a second copy of word_embeddings on the last stage, with
    #    initial parameters of 0.0.
    # 2. Do an all-reduce between the first and last stage to ensure that
    #    the two copies of word_embeddings start off with the same
    #    parameter values.
    # 3. In the training loop, all-reduce the grads of the two
    #    word_embeddings layers before they are applied, so that every
    #    weight update is the same on both stages.
    if mpu.is_pipeline_last_stage():
        if not mpu.is_pipeline_first_stage():
            self._word_embeddings_for_head_key = 'word_embeddings_for_head'
            # If first and last stages are different, set word_embeddings
            # weights to 0 here, then copy first stage's weights using
            # all_reduce below.
            self.word_embeddings = mpu.VocabParallelEmbedding(
                args.padded_vocab_size, args.hidden_size,
                init_method=init_method_normal(args.init_method_std))
            self.word_embeddings.weight.data.fill_(0)
            self.word_embeddings.weight.shared = True

    # Ensure that first and last stages have the same initial parameter
    # values.
    if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
        torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                     group=mpu.get_embedding_group())
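
# Step 3 above happens in the training loop rather than here. A minimal sketch
# of that gradient all-reduce, assuming a setup where the gradient lives in
# .grad (mixed-precision builds may keep it elsewhere, e.g. main_grad) and that
# the usual mpu pipeline helpers are available; the helper name is hypothetical:
def allreduce_word_embedding_grads(model):
    if mpu.get_pipeline_model_parallel_world_size() > 1 and \
            (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()):
        # Sum the grads of the two tied copies so both stages apply the
        # same update.
        grad = model.word_embeddings_weight().grad
        torch.distributed.all_reduce(grad, group=mpu.get_embedding_group())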
def __init__(self, neox_args, hidden_size, vocab_size, max_sequence_length,
             embedding_dropout_prob, init_method, num_tokentypes=0,
             use_pos_emb=True):
    super(Embedding, self).__init__()

    self.hidden_size = hidden_size
    self.init_method = init_method
    self.num_tokentypes = num_tokentypes

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        neox_args=neox_args,
        num_embeddings=vocab_size,
        embedding_dim=self.hidden_size,
        init_method=self.init_method)
    self._word_embeddings_key = 'word_embeddings'

    # Position embedding (serial).
    self.use_pos_emb = use_pos_emb
    if self.use_pos_emb:
        self.embedding_type = neox_args.pos_emb
        if self.embedding_type == "learned":
            self.position_embeddings = torch.nn.Embedding(
                max_sequence_length, self.hidden_size)
            self._position_embeddings_key = 'position_embeddings'
            # Initialize the position embeddings.
            self.init_method(self.position_embeddings.weight)
        elif self.embedding_type == "sinusoidal":
            self.position_embeddings = SinusoidalPositionalEmbedding(
                self.hidden_size)

    # Token type embedding.
    # Add this as an optional field that can be added through
    # method call so we can load a pretrained model without
    # token types and add them as needed.
    self._tokentype_embeddings_key = 'tokentype_embeddings'
    if self.num_tokentypes > 0:
        self.tokentype_embeddings = torch.nn.Embedding(
            self.num_tokentypes, self.hidden_size)
        # Initialize the token-type embeddings.
        self.init_method(self.tokentype_embeddings.weight)
    else:
        self.tokentype_embeddings = None

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
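
# Illustrative sketch of how these submodules are typically combined in the
# forward pass (the class's actual forward() is not shown above, and sinusoidal
# position embeddings in particular may take different arguments):
def forward(self, input_ids, position_ids, tokentype_ids=None):
    embeddings = self.word_embeddings(input_ids)
    if self.use_pos_emb and self.embedding_type == "learned":
        embeddings = embeddings + self.position_embeddings(position_ids)
    if tokentype_ids is not None:
        assert self.tokentype_embeddings is not None
        embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
    return self.embedding_dropout(embeddings)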
def __init__(self, hidden_size, vocab_size, max_sequence_length,
             embedding_dropout_prob, init_method, rotary_pos_emb=False,
             num_tokentypes=0):
    super(Embedding, self).__init__()
    args = get_args()

    self.hidden_size = hidden_size
    self.init_method = init_method
    self.num_tokentypes = num_tokentypes

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, self.hidden_size, init_method=self.init_method)
    self._word_embeddings_key = 'word_embeddings'

    # Position embedding (serial).
    self.embedding_type = args.pos_emb
    if self.embedding_type == "learned":
        self.position_embeddings = torch.nn.Embedding(
            max_sequence_length, self.hidden_size)
        self._position_embeddings_key = 'position_embeddings'
        # Initialize the position embeddings.
        self.init_method(self.position_embeddings.weight)
    elif self.embedding_type == "sinusoidal":
        self.position_embeddings = SinusoidalPositionalEmbedding(
            self.hidden_size)
    elif self.embedding_type == 'rotary':
        hidden_size_per_attention_head = mpu.divide(
            args.hidden_size, args.num_attention_heads)
        self.rotary_pos_emb = SinusoidalPositionalEmbedding(
            hidden_size_per_attention_head)

    # Token type embedding.
    # Add this as an optional field that can be added through
    # method call so we can load a pretrained model without
    # token types and add them as needed.
    self._tokentype_embeddings_key = 'tokentype_embeddings'
    if self.num_tokentypes > 0:
        self.tokentype_embeddings = torch.nn.Embedding(
            self.num_tokentypes, self.hidden_size)
        # Initialize the token-type embeddings.
        self.init_method(self.tokentype_embeddings.weight)
    else:
        self.tokentype_embeddings = None

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
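
# The 'rotary' branch above only builds a per-attention-head frequency table;
# unlike the learned/sinusoidal branches, nothing is added to the hidden states
# here -- the rotation is applied to queries and keys inside attention. A
# generic, illustrative rotation is sketched below; conventions (interleaved vs.
# half-split pairs, base, caching) differ between implementations, and this is
# not this repo's actual rotary application code.
import torch

def apply_rotary(x, positions, base=10000):
    # x: [seq, ..., head_dim] with an even head_dim; positions: [seq].
    head_dim = x.shape[-1]
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2,
                                            dtype=torch.float32) / head_dim))
    angles = positions.to(torch.float32)[:, None] * inv_freq[None, :]
    while angles.dim() < x.dim():
        angles = angles.unsqueeze(1)  # broadcast over batch/head dims
    sin, cos = angles.sin(), angles.cos()
    # Rotate each consecutive pair (x1, x2) by the position-dependent angle.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1)
    return out.flatten(-2)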
def initialize_word_embeddings(self, init_method_normal):
    args = get_args()
    if not self.share_word_embeddings:
        raise Exception('initialize_word_embeddings() was called but '
                        'share_word_embeddings is false')

    # This function just initializes the word embeddings in the final stage
    # when we are using pipeline parallelism. If we aren't using pipeline
    # parallelism there is nothing to do.
    if args.pipeline_model_parallel_size == 1:
        return

    # Parameters are shared between the word embeddings layer, and the
    # heads at the end of the model. In a pipelined setup with more than
    # one stage, the initial embedding layer and the head are on different
    # workers, so we do the following:
    # 1. Create a second copy of word_embeddings on the last stage, with
    #    initial parameters of 0.0.
    # 2. Do an all-reduce between the first and last stage to ensure that
    #    the two copies of word_embeddings start off with the same
    #    parameter values.
    # 3. In the training loop, all-reduce the grads of the two
    #    word_embeddings layers before they are applied, so that every
    #    weight update is the same on both stages.
    if mpu.is_pipeline_last_stage():
        assert not mpu.is_pipeline_first_stage()
        self._word_embeddings_for_head_key = 'word_embeddings_for_head'
        # Set word_embeddings weights to 0 here, then copy first
        # stage's weights using all_reduce below.
        self.word_embeddings = mpu.VocabParallelEmbedding(
            args.padded_vocab_size, args.hidden_size,
            init_method=init_method_normal(args.init_method_std))
        self.word_embeddings.weight.data.fill_(0)
        self.word_embeddings.weight.shared = True

    # Ensure that first and last stages have the same initial parameter
    # values.
    if torch.distributed.is_initialized():
        if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
            torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                         group=mpu.get_embedding_group())
    else:
        print("WARNING! Distributed processes aren't initialized, so "
              "word embeddings in the last layer are not initialized. "
              "If you are just manipulating a model this is fine, but "
              "this needs to be handled manually. If you are training "
              "something is definitely wrong.")
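
# The all-reduce above relies on self.word_embeddings_weight(), which is not
# part of this snippet. A plausible sketch of that accessor, assuming the first
# stage keeps its embedding under self.language_model.embedding (that attribute
# path is an assumption about the surrounding model class):
def word_embeddings_weight(self):
    if mpu.is_pipeline_first_stage():
        return self.language_model.embedding.word_embeddings.weight
    if mpu.is_pipeline_last_stage():
        if not self.share_word_embeddings:
            raise Exception('word_embeddings_weight() called for last stage, '
                            'but share_word_embeddings is false')
        return self.word_embeddings.weight
    raise Exception('word_embeddings_weight() should only be called on the '
                    'first or last pipeline stage')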
def __init__(self, hidden_size, vocab_size, max_sequence_length,
             embedding_dropout_prob, init_method, num_tokentypes=0,
             scattered_embeddings=False):
    super(Embedding, self).__init__()

    self.hidden_size = hidden_size
    self.init_method = init_method
    self.num_tokentypes = num_tokentypes
    self.scattered_embeddings = scattered_embeddings

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, self.hidden_size, init_method=self.init_method)
    self._word_embeddings_key = 'word_embeddings'

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  self.hidden_size)
    self._position_embeddings_key = 'position_embeddings'
    with deepspeed.zero.GatheredParameters(self.position_embeddings.weight,
                                           modifier_rank=0):
        # Initialize the position embeddings.
        self.init_method(self.position_embeddings.weight)

    # Token type embedding.
    # Add this as an optional field that can be added through
    # method call so we can load a pretrained model without
    # token types and add them as needed.
    self._tokentype_embeddings_key = 'tokentype_embeddings'
    if self.num_tokentypes > 0:
        self.tokentype_embeddings = torch.nn.Embedding(
            self.num_tokentypes, self.hidden_size)
        with deepspeed.zero.GatheredParameters(
                self.tokentype_embeddings.weight, modifier_rank=0):
            # Initialize the token-type embeddings.
            self.init_method(self.tokentype_embeddings.weight)
    else:
        self.tokentype_embeddings = None

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
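
# Standalone illustration of the deepspeed.zero.GatheredParameters pattern used
# above: under ZeRO stage 3 the embedding weight is partitioned across ranks,
# so it must be gathered before it can be initialized in place; modifier_rank=0
# means only rank 0 writes, and the updated values are re-partitioned when the
# context exits. The context manager is effectively a no-op for parameters that
# are not ZeRO-partitioned. The layer and init values here are just examples.
import torch
import deepspeed

position_embeddings = torch.nn.Embedding(2048, 1024)
with deepspeed.zero.GatheredParameters(position_embeddings.weight,
                                       modifier_rank=0):
    torch.nn.init.normal_(position_embeddings.weight, mean=0.0, std=0.02)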
def __init__(
    self,
    neox_args,
    hidden_size,
    vocab_size,
    max_sequence_length,
    embedding_dropout_prob,
    init_method,
    num_tokentypes=0,
    use_pos_emb=True,
):
    super(Embedding, self).__init__()

    self.hidden_size = hidden_size
    self.init_method = init_method
    self.num_tokentypes = num_tokentypes

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        neox_args=neox_args,
        num_embeddings=vocab_size,
        embedding_dim=self.hidden_size,
        init_method=self.init_method,
    )
    self._word_embeddings_key = "word_embeddings"

    if neox_args.use_bnb_optimizer:
        try:
            import bitsandbytes as bnb

            self.embedding_module = bnb.nn.StableEmbedding
        except ModuleNotFoundError:
            print(
                "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes."
            )
            raise Exception
    else:
        self.embedding_module = torch.nn.Embedding

    # Position embedding (serial).
    self.use_pos_emb = use_pos_emb
    if self.use_pos_emb:
        self.embedding_type = neox_args.pos_emb
        if self.embedding_type == "learned":
            self.position_embeddings = self.embedding_module(
                max_sequence_length, self.hidden_size
            )
            self._position_embeddings_key = "position_embeddings"
            # Initialize the position embeddings.
            self.init_method(self.position_embeddings.weight)
        elif self.embedding_type == "sinusoidal":
            self.position_embeddings = SinusoidalPositionalEmbedding(
                self.hidden_size
            )

    # Token type embedding.
    # Add this as an optional field that can be added through
    # method call so we can load a pretrained model without
    # token types and add them as needed.
    self._tokentype_embeddings_key = "tokentype_embeddings"
    if self.num_tokentypes > 0:
        self.tokentype_embeddings = self.embedding_module(
            self.num_tokentypes, self.hidden_size
        )
        # Initialize the token-type embeddings.
        self.init_method(self.tokentype_embeddings.weight)
    else:
        self.tokentype_embeddings = None

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    self.opt_pos_emb_offset = neox_args.opt_pos_emb_offset

    # For ticking position ids forward
    self.layer_past = None
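
# The last two attributes support OPT-style checkpoints and incremental
# decoding: OPT reserves the first slots of its learned position table, so
# position ids are shifted by opt_pos_emb_offset before the lookup, and
# layer_past tracks how far positions have advanced between decode steps. A
# hedged sketch of that bookkeeping (hypothetical helper name; the actual
# forward()/generation code is not shown in this snippet):
def _offset_position_ids(self, position_ids):
    if self.opt_pos_emb_offset:
        if self.layer_past is not None:
            # Continue counting from the last cached position.
            position_ids = position_ids + self.layer_past + 1
        self.layer_past = position_ids[:, -1]
        position_ids = position_ids + self.opt_pos_emb_offset
    return position_ids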