def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, relative_encoding=False,
             block_position_encoding=False, output_predict=True, spell_length=None):
    super(GLMModel, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        max_sequence_length,
        max_memory_length,
        embedding_dropout_prob,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        relative_encoding=relative_encoding,
        block_position_encoding=block_position_encoding)

    if spell_length is not None:
        # Prompt ("spell") encoder: learned prompt embeddings passed through
        # a BiLSTM and an MLP projection.
        self.spell_length = spell_length
        self.spell_embeddings = torch.nn.Embedding(self.spell_length,
                                                   self.hidden_size)
        self.lstm_head = torch.nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=2,
            # dropout=self.lstm_dropout,
            bidirectional=True,
            batch_first=True)  # .to(torch.device("cuda"))
        self.mlp_head = torch.nn.Sequential(
            torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size))
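# --- Illustrative sketch (not part of the original model code) ---
# A minimal, standalone example of how a spell_embeddings -> lstm_head ->
# mlp_head stack like the one above is typically used as a P-tuning-style
# prompt encoder. The sizes, batch layout, and the forward wiring here are
# assumptions for illustration, not the repository's actual forward pass.
import torch

hidden_size, spell_length, batch = 8, 4, 2
spell_embeddings = torch.nn.Embedding(spell_length, hidden_size)
lstm_head = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                          num_layers=2, bidirectional=True, batch_first=True)
mlp_head = torch.nn.Sequential(
    torch.nn.Linear(2 * hidden_size, hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_size, hidden_size))

# Look up the learned prompt tokens, contextualize them with the BiLSTM,
# then project the 2*hidden BiLSTM output back down to hidden_size.
prompt_ids = torch.arange(spell_length).unsqueeze(0).expand(batch, -1)
prompt_embeds = mlp_head(lstm_head(spell_embeddings(prompt_ids))[0])
print(prompt_embeds.shape)  # torch.Size([2, 4, 8])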
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True, deepspeed_sparsity_config=None, sparse_mode=None):
    super(GPT2Model, self).__init__()

    self._conf_dict = conf_dict = {
        'vocab_size': vocab_size,
        'n_positions': max_sequence_length,
        'n_ctx': max_sequence_length,
        'n_embd': hidden_size,
        'n_layer': num_layers,
        'n_head': num_attention_heads
    }

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        use_deepspeed_sparse=deepspeed_sparsity_config,
        sparse_mode=sparse_mode)
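# --- Illustrative sketch (not part of the original model code) ---
# The _conf_dict above uses field names that resemble the classic GPT-2
# config layout (n_positions, n_ctx, n_embd, n_layer, n_head), so it can be
# serialized for tooling that expects that layout. The values below are
# hypothetical, not taken from the repository.
import json

conf_dict = {'vocab_size': 50257, 'n_positions': 1024, 'n_ctx': 1024,
             'n_embd': 768, 'n_layer': 12, 'n_head': 12}
print(json.dumps(conf_dict, indent=2))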
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)

    # Token type embedding.
    # Kept as an optional field that can be attached later through a
    # method call, so a pretrained model without token types can be
    # loaded and token types added as needed.
    self.tokentype_embeddings = None
    self.hidden_size = hidden_size

    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)
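# --- Illustrative sketch (not part of the original model code) ---
# A hypothetical helper showing how token type embeddings could later be
# attached to a model whose tokentype_embeddings starts out as None, as the
# comment above describes. The function name, signature, and init choice
# (normal, std=0.02, matching the position embeddings) are assumptions, not
# the repository's actual API.
import types

import torch


def add_tokentype_embeddings(model, num_tokentypes):
    # Create the embedding table lazily and initialize it the same way the
    # position embeddings are initialized in __init__.
    model.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
                                                    model.hidden_size)
    torch.nn.init.normal_(model.tokentype_embeddings.weight, std=0.02)


# Stand-in object with the two attributes the helper relies on.
dummy = types.SimpleNamespace(hidden_size=8, tokentype_embeddings=None)
add_tokentype_embeddings(dummy, num_tokentypes=2)
print(dummy.tokentype_embeddings.weight.shape)  # torch.Size([2, 8])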
def __init__(self, num_layers, vocab_size, hidden_size, num_labels,
             num_attention_heads, embedding_dropout_prob, attention_dropout_prob,
             output_dropout_prob, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True):
    super(GPT2Model_C, self).__init__()

    self.parallel_output = parallel_output
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)
    # self.cls = mpu.VocabParallelEmbedding(
    #     1024, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)

    # Classification head: hidden -> hidden projection followed by the
    # label classifier.
    self.linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
    self.classifier = torch.nn.Linear(self.hidden_size, num_labels)
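# --- Illustrative sketch (not part of the original model code) ---
# Standalone illustration of how a linear + classifier head like the one
# above is commonly applied to transformer output for sequence
# classification. The pooling choice (first token) and the tanh activation
# are assumptions for illustration, not the repository's forward pass.
import torch

hidden_size, num_labels, batch, seq = 8, 3, 2, 5
linear = torch.nn.Linear(hidden_size, hidden_size)
classifier = torch.nn.Linear(hidden_size, num_labels)

hidden_states = torch.randn(batch, seq, hidden_size)   # transformer output
pooled = torch.tanh(linear(hidden_states[:, 0]))        # first-token pooling
logits = classifier(pooled)
print(logits.shape)  # torch.Size([2, 3])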
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, output_predict=True):
    super(EncoderDecoder, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.encoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers)
    self.decoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers,
                                               use_decoder_layer=True)
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, relative_encoding=False):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        max_sequence_length,
        max_memory_length,
        embedding_dropout_prob,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        relative_encoding=relative_encoding)
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, num_experts=1,
             type_vocab_size=2):
    super(BertMixtureModel, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Token type embeddings.
    self.token_type_embeddings = torch.nn.Embedding(type_vocab_size,
                                                    hidden_size)
    # Initialize the token type embeddings.
    init_method(self.token_type_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Per-task output heads (row-parallel projections to the task outputs).
    self.hrs_head = RowParallelLinear(hidden_size, 1, input_is_parallel=True,
                                      init_method=init.xavier_normal_)
    init_method(self.hrs_head.weight)
    self.click_head = RowParallelLinear(hidden_size, 1, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.click_head.weight)
    self.lpsat_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.lpsat_head.weight)
    self.qc_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                     init_method=init.xavier_normal_)
    init_method(self.qc_head.weight)
    self.eff_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                      init_method=init.xavier_normal_)
    init_method(self.eff_head.weight)
    self.local_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.local_head.weight)
    self.fresh_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.fresh_head.weight)

    # Transformer
    self.transformer = mpu.BertParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   num_experts=num_experts)

    self.dropout = torch.nn.Dropout(output_dropout_prob)

    # Per-task intermediate dense layers (column-parallel, hidden -> hidden).
    self.dense_hrs0 = ColumnParallelLinear(hidden_size, hidden_size,
                                           gather_output=False,
                                           init_method=init.xavier_normal_)
    self.dense_click0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_lpsat0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_qc0 = ColumnParallelLinear(hidden_size, hidden_size,
                                          gather_output=False,
                                          init_method=init.xavier_normal_)
    self.dense_eff0 = ColumnParallelLinear(hidden_size, hidden_size,
                                           gather_output=False,
                                           init_method=init.xavier_normal_)
    self.dense_local0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_fresh0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
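# --- Illustrative sketch (not part of the original model code) ---
# Plain-PyTorch stand-in for one of the per-task branches above: a paired
# dense_*0 (hidden -> hidden) projection followed by its *_head
# (hidden -> output) layer applied to a pooled representation. The
# intermediate tanh activation and the dropout placement are assumptions,
# not the repository's forward pass, and the parallel layers are replaced
# with ordinary torch.nn.Linear modules for the sake of a runnable example.
import torch

hidden_size, batch = 8, 2
dense_hrs0 = torch.nn.Linear(hidden_size, hidden_size)
hrs_head = torch.nn.Linear(hidden_size, 1)
dropout = torch.nn.Dropout(0.1)

pooled = torch.randn(batch, hidden_size)                 # pooled encoder output
hrs_score = hrs_head(dropout(torch.tanh(dense_hrs0(pooled))))
print(hrs_score.shape)  # torch.Size([2, 1])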
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, num_experts=1,
             type_vocab_size=2):
    super(BertMixtureModel_v0, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Token type embeddings.
    self.token_type_embeddings = torch.nn.Embedding(type_vocab_size,
                                                    hidden_size)
    # Initialize the token type embeddings.
    init_method(self.token_type_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Scalar prediction heads (hrs and click).
    self.hrs_head = torch.nn.Linear(hidden_size, 1)
    init_method(self.hrs_head.weight)
    self.click_head = torch.nn.Linear(hidden_size, 1)
    init_method(self.click_head.weight)

    # Transformer
    self.transformer = mpu.BertParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   num_experts=num_experts)

    self.dropout = torch.nn.Dropout(output_dropout_prob)