def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True,
             relative_encoding=False, block_position_encoding=False,
             output_predict=True, spell_length=None):
    super(GLMModel, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   max_sequence_length,
                                                   max_memory_length,
                                                   embedding_dropout_prob,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   relative_encoding=relative_encoding,
                                                   block_position_encoding=block_position_encoding)

    if spell_length is not None:
        # Prompt ("spell") parameters: learned spell embeddings plus an
        # LSTM/MLP prompt encoder.
        self.spell_length = spell_length
        self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size)
        self.lstm_head = torch.nn.LSTM(input_size=self.hidden_size,
                                       hidden_size=self.hidden_size,
                                       num_layers=2,
                                       # dropout=self.lstm_dropout,
                                       bidirectional=True,
                                       batch_first=True)  # .to(torch.device("cuda"))
        self.mlp_head = torch.nn.Sequential(
            torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size))
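# Usage sketch (illustrative, not part of the original sources): constructing a
# GLMModel with an 8-token learned prompt, which activates the spell_embeddings,
# lstm_head and mlp_head branches above. All hyperparameter values are made up,
# and the call assumes torch.distributed / mpu model parallelism has already
# been initialized elsewhere.
def _example_build_glm_model():
    model = GLMModel(num_layers=24,
                     vocab_size=50304,
                     hidden_size=1024,
                     num_attention_heads=16,
                     embedding_dropout_prob=0.1,
                     attention_dropout_prob=0.1,
                     output_dropout_prob=0.1,
                     max_sequence_length=512,
                     max_memory_length=0,
                     checkpoint_activations=True,
                     block_position_encoding=True,
                     spell_length=8)
    return model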
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True, deepspeed_sparsity_config=None, sparse_mode=None):
    super(GPT2Model, self).__init__()

    self._conf_dict = conf_dict = {
        'vocab_size': vocab_size,
        'n_positions': max_sequence_length,
        'n_ctx': max_sequence_length,
        'n_embd': hidden_size,
        'n_layer': num_layers,
        'n_head': num_attention_heads
    }

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   use_deepspeed_sparse=deepspeed_sparsity_config,
                                                   sparse_mode=sparse_mode)
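# Usage sketch (illustrative, not part of the original sources): this variant
# forwards deepspeed_sparsity_config to the transformer as use_deepspeed_sparse,
# so a DeepSpeed sparse-attention config object can be plugged in; passing None
# (the default) presumably keeps dense attention. Hyperparameter values are made
# up and mpu model parallelism is assumed to be initialized already.
def _example_build_sparse_gpt2_model():
    sparsity_config = None  # e.g. a DeepSpeed sparse-attention config object, if desired
    model = GPT2Model(num_layers=12,
                      vocab_size=50304,
                      hidden_size=768,
                      num_attention_heads=12,
                      embedding_dropout_prob=0.1,
                      attention_dropout_prob=0.1,
                      output_dropout_prob=0.1,
                      max_sequence_length=1024,
                      checkpoint_activations=False,
                      deepspeed_sparsity_config=sparsity_config,
                      sparse_mode=None)
    return model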
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, output_predict=True):
    super(EncoderDecoder, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.encoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers)
    self.decoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers,
                                               use_decoder_layer=True)
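# Usage sketch (illustrative, not part of the original sources): EncoderDecoder
# builds two GPT2ParallelTransformer stacks of identical shape, the second one
# flagged with use_decoder_layer=True, and a single word_embeddings table that
# is presumably shared by both in the forward pass. Values are made up and mpu
# model parallelism is assumed to be initialized.
def _example_build_encoder_decoder():
    model = EncoderDecoder(num_layers=6,
                           vocab_size=50304,
                           hidden_size=512,
                           num_attention_heads=8,
                           embedding_dropout_prob=0.1,
                           attention_dropout_prob=0.1,
                           output_dropout_prob=0.1,
                           max_sequence_length=256,
                           max_memory_length=0,
                           checkpoint_activations=False)
    return model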
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)

    # Token type embedding.
    # Add this as an optional field that can be added through
    # a method call so we can load a pretrained model without
    # token types and add them as needed.
    self.tokentype_embeddings = None
    self.hidden_size = hidden_size

    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)
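# Usage sketch (illustrative, not part of the original sources): a plain
# GPT2Model starts with tokentype_embeddings set to None so that checkpoints
# trained without token types can still be loaded; the embedding is attached
# later via a separate method call (not shown here). Values are made up and mpu
# model parallelism is assumed to be initialized.
def _example_build_gpt2_model():
    model = GPT2Model(num_layers=12,
                      vocab_size=50304,
                      hidden_size=768,
                      num_attention_heads=12,
                      embedding_dropout_prob=0.1,
                      attention_dropout_prob=0.1,
                      output_dropout_prob=0.1,
                      max_sequence_length=1024,
                      checkpoint_activations=True)
    assert model.tokentype_embeddings is None  # added on demand, not at construction
    return model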
def __init__(self, num_layers, vocab_size, hidden_size, num_labels, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True):
    super(GPT2Model_C, self).__init__()

    self.parallel_output = parallel_output
    # Keep the hidden size around for the classification head below.
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)
    # self.cls = mpu.VocabParallelEmbedding(
    #     1024, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)

    # Classification head.
    self.linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
    self.classifier = torch.nn.Linear(self.hidden_size, num_labels)
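# Usage sketch (illustrative, not part of the original sources): GPT2Model_C
# adds a two-layer classification head (self.linear followed by self.classifier)
# on top of the transformer, sized by num_labels. Values are made up and mpu
# model parallelism is assumed to be initialized.
def _example_build_gpt2_classifier():
    model = GPT2Model_C(num_layers=12,
                        vocab_size=50304,
                        hidden_size=768,
                        num_labels=2,  # e.g. binary classification
                        num_attention_heads=12,
                        embedding_dropout_prob=0.1,
                        attention_dropout_prob=0.1,
                        output_dropout_prob=0.1,
                        max_sequence_length=512,
                        checkpoint_activations=False)
    return model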
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, relative_encoding=False):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   max_sequence_length,
                                                   max_memory_length,
                                                   embedding_dropout_prob,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   relative_encoding=relative_encoding)
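# Usage sketch (illustrative, not part of the original sources): this GPT2Model
# variant exposes max_memory_length and relative_encoding, which appear to
# correspond to Transformer-XL style hidden-state memory and relative position
# encoding inside GPT2ParallelTransformer. Values are made up and mpu model
# parallelism is assumed to be initialized.
def _example_build_gpt2_with_memory():
    model = GPT2Model(num_layers=24,
                      vocab_size=50304,
                      hidden_size=1024,
                      num_attention_heads=16,
                      embedding_dropout_prob=0.1,
                      attention_dropout_prob=0.1,
                      output_dropout_prob=0.1,
                      max_sequence_length=512,
                      max_memory_length=512,  # cache up to 512 past hidden states
                      checkpoint_activations=True,
                      relative_encoding=True)
    return model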
def configure_transformer(config):
    # The second positional argument is the hidden size, so pass config.n_embd
    # rather than config.n_positions (the maximum sequence length).
    transformer = mpu.GPT2ParallelTransformer(config.n_layer,
                                              config.n_embd,
                                              config.n_head,
                                              config.attn_pdrop,
                                              config.resid_pdrop,
                                              True,  # checkpoint_activations
                                              1)     # checkpoint_num_layers
    return transformer
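# Usage sketch (illustrative, not part of the original sources):
# configure_transformer only needs an object exposing GPT-2 style attribute
# names, so a plain namespace is enough for illustration. Values are made up and
# mpu model parallelism is assumed to be initialized.
def _example_configure_transformer():
    from types import SimpleNamespace
    config = SimpleNamespace(n_layer=12,
                             n_embd=768,
                             n_positions=1024,
                             n_head=12,
                             attn_pdrop=0.1,
                             resid_pdrop=0.1)
    return configure_transformer(config)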