Example 1
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 relative_encoding=False,
                 block_position_encoding=False,
                 output_predict=True,
                 spell_length=None):

        super(GLMModel, self).__init__()

        self.parallel_output = parallel_output
        self.output_predict = output_predict
        self.hidden_size = hidden_size

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            relative_encoding=relative_encoding,
            block_position_encoding=block_position_encoding)
        if spell_length is not None:
            self.spell_length = spell_length
            self.spell_embeddings = torch.nn.Embedding(self.spell_length,
                                                       self.hidden_size)
            self.lstm_head = torch.nn.LSTM(
                input_size=self.hidden_size,
                hidden_size=self.hidden_size,
                num_layers=2,
                # dropout=self.lstm_dropout,
                bidirectional=True,
                batch_first=True)  # .to(torch.device("cuda"))
            self.mlp_head = torch.nn.Sequential(
                torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(self.hidden_size, self.hidden_size))
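
The spell_length branch above appears to follow the P-tuning prompt-encoder pattern: learned prompt ("spell") embeddings are refined by a bidirectional LSTM and a small MLP before being fed to the model. A minimal, self-contained sketch of the same pattern in plain PyTorch; the sizes and variable names here are illustrative, not taken from the source:

import torch

spell_length, hidden_size, batch = 8, 64, 2   # illustrative sizes

spell_embeddings = torch.nn.Embedding(spell_length, hidden_size)
lstm_head = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                          num_layers=2, bidirectional=True, batch_first=True)
mlp_head = torch.nn.Sequential(
    torch.nn.Linear(2 * hidden_size, hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_size, hidden_size))

# One prompt of spell_length virtual tokens per batch element.
prompt_ids = torch.arange(spell_length).unsqueeze(0).expand(batch, -1)
prompt = spell_embeddings(prompt_ids)   # (batch, spell_length, hidden)
prompt, _ = lstm_head(prompt)           # (batch, spell_length, 2 * hidden)
prompt = mlp_head(prompt)               # (batch, spell_length, hidden)
print(prompt.shape)                     # torch.Size([2, 8, 64])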
Example 2
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 deepspeed_sparsity_config=None,
                 sparse_mode=None):

        super(GPT2Model, self).__init__()

        self._conf_dict = conf_dict = {
            'vocab_size': vocab_size,
            'n_positions': max_sequence_length,
            'n_ctx': max_sequence_length,
            'n_embd': hidden_size,
            'n_layer': num_layers,
            'n_head': num_attention_heads
        }

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Position embedding (serial).
        self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                      hidden_size)
        # Initialize the position embeddings.
        init_method(self.position_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            use_deepspeed_sparse=deepspeed_sparsity_config,
            sparse_mode=sparse_mode)
Example 3
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True):

        super(GPT2Model, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Position embedding (serial).
        self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                      hidden_size)

        # Token type embedding.
        # Add this as an optional field that can be added through
        # method call so we can load a pretrain model without
        # token types and add them as needed.
        self.tokentype_embeddings = None
        self.hidden_size = hidden_size

        # Initialize the position embeddings.
        init_method(self.position_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                       hidden_size,
                                                       num_attention_heads,
                                                       attention_dropout_prob,
                                                       output_dropout_prob,
                                                       checkpoint_activations,
                                                       checkpoint_num_layers)
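
Examples 2 and 3 set up the standard GPT-2 input pipeline: parallel word embeddings plus a serial position embedding (and, optionally, a token-type embedding), summed and passed through dropout before the transformer stack. The forward pass is not part of this excerpt; the following self-contained sketch shows how such embeddings are typically combined in plain PyTorch (purely illustrative):

import torch

vocab_size, max_sequence_length, hidden_size = 100, 32, 64   # illustrative sizes
word_embeddings = torch.nn.Embedding(vocab_size, hidden_size)
position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size)
tokentype_embeddings = torch.nn.Embedding(2, hidden_size)    # optional
embedding_dropout = torch.nn.Dropout(0.1)

input_ids = torch.randint(0, vocab_size, (2, 16))            # (batch, seq)
position_ids = torch.arange(16).unsqueeze(0).expand(2, -1)
tokentype_ids = torch.zeros(2, 16, dtype=torch.long)

# Sum the embeddings, then apply dropout; the result feeds the transformer.
hidden = (word_embeddings(input_ids)
          + position_embeddings(position_ids)
          + tokentype_embeddings(tokentype_ids))
hidden = embedding_dropout(hidden)
print(hidden.shape)                                          # torch.Size([2, 16, 64])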
Example 4
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_labels,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True):

        super(GPT2Model_C, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # self.cls = mpu.VocabParallelEmbedding(
        #     1024, hidden_size, init_method=init_method)

        # Position embedding (serial).
        self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                      hidden_size)
        # Initialize the position embeddings.
        init_method(self.position_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                       hidden_size,
                                                       num_attention_heads,
                                                       attention_dropout_prob,
                                                       output_dropout_prob,
                                                       checkpoint_activations,
                                                       checkpoint_num_layers)
        # Classification head (self.hidden_size is never set in this
        # constructor, so use the local hidden_size argument instead).
        self.linear = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
Example 5
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 output_predict=True):
        super(EncoderDecoder, self).__init__()

        self.parallel_output = parallel_output
        self.output_predict = output_predict

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.encoder = mpu.GPT2ParallelTransformer(
            num_layers, hidden_size, num_attention_heads, max_sequence_length,
            max_memory_length, embedding_dropout_prob, attention_dropout_prob,
            output_dropout_prob, checkpoint_activations, checkpoint_num_layers)
        self.decoder = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   max_sequence_length,
                                                   max_memory_length,
                                                   embedding_dropout_prob,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   use_decoder_layer=True)
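
Example 5 instantiates the same parallel transformer twice, once as an encoder and once (with use_decoder_layer=True) as a decoder that attends to the encoder output. Since mpu.GPT2ParallelTransformer is not self-contained in this excerpt, here is a minimal sketch of the equivalent wiring using stock torch.nn modules instead; all sizes and names are illustrative:

import torch

hidden_size, num_heads, num_layers = 64, 4, 2   # illustrative sizes

encoder = torch.nn.TransformerEncoder(
    torch.nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads,
                                     batch_first=True),
    num_layers=num_layers)
decoder = torch.nn.TransformerDecoder(
    torch.nn.TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads,
                                     batch_first=True),
    num_layers=num_layers)

src = torch.randn(2, 10, hidden_size)   # already-embedded source tokens
tgt = torch.randn(2, 7, hidden_size)    # already-embedded target tokens

memory = encoder(src)                   # encoder output
out = decoder(tgt, memory)              # decoder cross-attends to the encoder
print(out.shape)                        # torch.Size([2, 7, 64])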
Example 6
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 relative_encoding=False):

        super(GPT2Model, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            relative_encoding=relative_encoding)
Example 7
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 max_sequence_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 num_experts=1,
                 type_vocab_size=2):

        super(BertMixtureModel, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Position embedding (serial).
        self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                      hidden_size)
        # Initialize the position embeddings.
        init_method(self.position_embeddings.weight)

        # Token Type Embeddings.
        self.token_type_embeddings = torch.nn.Embedding(
            type_vocab_size, hidden_size)

        # Initialize the token type embeddings.
        init_method(self.token_type_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        self.hrs_head = RowParallelLinear(hidden_size,
                                          1,
                                          input_is_parallel=True,
                                          init_method=init.xavier_normal_)

        init_method(self.hrs_head.weight)

        self.click_head = RowParallelLinear(hidden_size,
                                            1,
                                            input_is_parallel=True,
                                            init_method=init.xavier_normal_)

        init_method(self.click_head.weight)

        self.lpsat_head = RowParallelLinear(hidden_size,
                                            5,
                                            input_is_parallel=True,
                                            init_method=init.xavier_normal_)

        init_method(self.lpsat_head.weight)

        self.qc_head = RowParallelLinear(hidden_size,
                                         5,
                                         input_is_parallel=True,
                                         init_method=init.xavier_normal_)

        init_method(self.qc_head.weight)

        self.eff_head = RowParallelLinear(hidden_size,
                                          5,
                                          input_is_parallel=True,
                                          init_method=init.xavier_normal_)

        init_method(self.eff_head.weight)

        self.local_head = RowParallelLinear(hidden_size,
                                            5,
                                            input_is_parallel=True,
                                            init_method=init.xavier_normal_)

        init_method(self.local_head.weight)

        self.fresh_head = RowParallelLinear(hidden_size,
                                            5,
                                            input_is_parallel=True,
                                            init_method=init.xavier_normal_)

        init_method(self.fresh_head.weight)

        # Transformer
        self.transformer = mpu.BertParallelTransformer(num_layers,
                                                       hidden_size,
                                                       num_attention_heads,
                                                       attention_dropout_prob,
                                                       output_dropout_prob,
                                                       checkpoint_activations,
                                                       checkpoint_num_layers,
                                                       num_experts=num_experts)

        self.dropout = torch.nn.Dropout(output_dropout_prob)

        self.dense_hrs0 = ColumnParallelLinear(hidden_size,
                                               hidden_size,
                                               gather_output=False,
                                               init_method=init.xavier_normal_)

        self.dense_click0 = ColumnParallelLinear(
            hidden_size,
            hidden_size,
            gather_output=False,
            init_method=init.xavier_normal_)

        self.dense_lpsat0 = ColumnParallelLinear(
            hidden_size,
            hidden_size,
            gather_output=False,
            init_method=init.xavier_normal_)

        self.dense_qc0 = ColumnParallelLinear(hidden_size,
                                              hidden_size,
                                              gather_output=False,
                                              init_method=init.xavier_normal_)

        self.dense_eff0 = ColumnParallelLinear(hidden_size,
                                               hidden_size,
                                               gather_output=False,
                                               init_method=init.xavier_normal_)

        self.dense_local0 = ColumnParallelLinear(
            hidden_size,
            hidden_size,
            gather_output=False,
            init_method=init.xavier_normal_)

        self.dense_fresh0 = ColumnParallelLinear(
            hidden_size,
            hidden_size,
            gather_output=False,
            init_method=init.xavier_normal_)
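
Example 7 attaches one column-parallel projection and one row-parallel output head per task (hrs, click, lpsat, qc, eff, local, fresh) on top of the shared BERT encoder: 1 output unit for hrs/click and 5 for the rating-style heads. The forward pass is not included in this excerpt; the following is a generic sketch of how such per-task heads are commonly applied to a pooled encoder output, in plain PyTorch with illustrative names:

import torch

hidden_size = 64                         # illustrative size
dropout = torch.nn.Dropout(0.1)

# One hidden projection and one output head per task.
heads = torch.nn.ModuleDict({
    'hrs':   torch.nn.Sequential(torch.nn.Linear(hidden_size, hidden_size),
                                 torch.nn.Linear(hidden_size, 1)),
    'click': torch.nn.Sequential(torch.nn.Linear(hidden_size, hidden_size),
                                 torch.nn.Linear(hidden_size, 1)),
    'qc':    torch.nn.Sequential(torch.nn.Linear(hidden_size, hidden_size),
                                 torch.nn.Linear(hidden_size, 5)),
})

pooled = torch.randn(2, hidden_size)     # e.g. the [CLS] representation
outputs = {name: head(dropout(pooled)) for name, head in heads.items()}
print({name: tuple(v.shape) for name, v in outputs.items()})
# {'hrs': (2, 1), 'click': (2, 1), 'qc': (2, 5)}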
Example 8
    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 max_sequence_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 num_experts=1,
                 type_vocab_size=2):

        super(BertMixtureModel_v0, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Position embedding (serial).
        self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                      hidden_size)
        # Initialize the position embeddings.
        init_method(self.position_embeddings.weight)

        # Token Type Embeddings.
        self.token_type_embeddings = torch.nn.Embedding(
            type_vocab_size, hidden_size)

        # Initialize the token type embeddings.
        init_method(self.token_type_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        self.hrs_head = torch.nn.Linear(hidden_size, 1)

        init_method(self.hrs_head.weight)

        self.click_head = torch.nn.Linear(hidden_size, 1)

        init_method(self.click_head.weight)

        # Transformer
        self.transformer = mpu.BertParallelTransformer(num_layers,
                                                       hidden_size,
                                                       num_attention_heads,
                                                       attention_dropout_prob,
                                                       output_dropout_prob,
                                                       checkpoint_activations,
                                                       checkpoint_num_layers,
                                                       num_experts=num_experts)

        self.dropout = torch.nn.Dropout(output_dropout_prob)