def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, relative_encoding=False,
             block_position_encoding=False, output_predict=True, spell_length=None):
    super(GLMModel, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        max_sequence_length,
        max_memory_length,
        embedding_dropout_prob,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        relative_encoding=relative_encoding,
        block_position_encoding=block_position_encoding)

    if spell_length is not None:
        # Prompt ("spell") encoder: learned prompt embeddings passed through
        # a BiLSTM and an MLP projection.
        self.spell_length = spell_length
        self.spell_embeddings = torch.nn.Embedding(self.spell_length,
                                                   self.hidden_size)
        self.lstm_head = torch.nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=2,
            # dropout=self.lstm_dropout,
            bidirectional=True,
            batch_first=True)  # .to(torch.device("cuda"))
        self.mlp_head = torch.nn.Sequential(
            torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size))
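# --- Illustrative sketch (not part of the original model code) ---
# A minimal, standalone example of how a spell_embeddings -> lstm_head ->
# mlp_head stack like the one above is typically used as a P-tuning-style
# prompt encoder. The sizes, batch layout, and the forward wiring here are
# assumptions for illustration, not the repository's actual forward pass.
import torch

hidden_size, spell_length, batch = 8, 4, 2
spell_embeddings = torch.nn.Embedding(spell_length, hidden_size)
lstm_head = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                          num_layers=2, bidirectional=True, batch_first=True)
mlp_head = torch.nn.Sequential(
    torch.nn.Linear(2 * hidden_size, hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_size, hidden_size))

# Look up the learned prompt tokens, contextualize them with the BiLSTM,
# then project the 2*hidden BiLSTM output back down to hidden_size.
prompt_ids = torch.arange(spell_length).unsqueeze(0).expand(batch, -1)
prompt_embeds = mlp_head(lstm_head(spell_embeddings(prompt_ids))[0])
print(prompt_embeds.shape)  # torch.Size([2, 4, 8])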
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True, deepspeed_sparsity_config=None, sparse_mode=None):
    super(GPT2Model, self).__init__()

    self._conf_dict = conf_dict = {
        'vocab_size': vocab_size,
        'n_positions': max_sequence_length,
        'n_ctx': max_sequence_length,
        'n_embd': hidden_size,
        'n_layer': num_layers,
        'n_head': num_attention_heads
    }

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        use_deepspeed_sparse=deepspeed_sparsity_config,
        sparse_mode=sparse_mode)
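# --- Illustrative sketch (not part of the original model code) ---
# The _conf_dict above uses field names that resemble the classic GPT-2
# config layout (n_positions, n_ctx, n_embd, n_layer, n_head), so it can be
# serialized for tooling that expects that layout. The values below are
# hypothetical, not taken from the repository.
import json

conf_dict = {'vocab_size': 50257, 'n_positions': 1024, 'n_ctx': 1024,
             'n_embd': 768, 'n_layer': 12, 'n_head': 12}
print(json.dumps(conf_dict, indent=2))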
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, checkpoint_activations, checkpoint_num_layers=1,
             parallel_output=True):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)

    # Token type embedding.
    # Kept as an optional field that can be attached later through a
    # method call, so a pretrained model without token types can be
    # loaded and token types added as needed.
    self.tokentype_embeddings = None
    self.hidden_size = hidden_size

    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)
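# --- Illustrative sketch (not part of the original model code) ---
# A hypothetical helper showing how token type embeddings could later be
# attached to a model whose tokentype_embeddings starts out as None, as the
# comment above describes. The function name, signature, and init choice
# (normal, std=0.02, matching the position embeddings) are assumptions, not
# the repository's actual API.
import types

import torch


def add_tokentype_embeddings(model, num_tokentypes):
    # Create the embedding table lazily and initialize it the same way the
    # position embeddings are initialized in __init__.
    model.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
                                                    model.hidden_size)
    torch.nn.init.normal_(model.tokentype_embeddings.weight, std=0.02)


# Stand-in object with the two attributes the helper relies on.
dummy = types.SimpleNamespace(hidden_size=8, tokentype_embeddings=None)
add_tokentype_embeddings(dummy, num_tokentypes=2)
print(dummy.tokentype_embeddings.weight.shape)  # torch.Size([2, 8])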
def __init__(self, num_layers, vocab_size, hidden_size, num_labels,
             num_attention_heads, embedding_dropout_prob, attention_dropout_prob,
             output_dropout_prob, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True):
    super(GPT2Model_C, self).__init__()

    self.parallel_output = parallel_output
    self.hidden_size = hidden_size

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)
    # self.cls = mpu.VocabParallelEmbedding(
    #     1024, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers)

    # Classification head: hidden -> hidden projection followed by the
    # label classifier.
    self.linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
    self.classifier = torch.nn.Linear(self.hidden_size, num_labels)
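# --- Illustrative sketch (not part of the original model code) ---
# Standalone illustration of how a linear + classifier head like the one
# above is commonly applied to transformer output for sequence
# classification. The pooling choice (first token) and the tanh activation
# are assumptions for illustration, not the repository's forward pass.
import torch

hidden_size, num_labels, batch, seq = 8, 3, 2, 5
linear = torch.nn.Linear(hidden_size, hidden_size)
classifier = torch.nn.Linear(hidden_size, num_labels)

hidden_states = torch.randn(batch, seq, hidden_size)   # transformer output
pooled = torch.tanh(linear(hidden_states[:, 0]))        # first-token pooling
logits = classifier(pooled)
print(logits.shape)  # torch.Size([2, 3])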
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, output_predict=True):
    super(EncoderDecoder, self).__init__()

    self.parallel_output = parallel_output
    self.output_predict = output_predict

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.encoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers)
    self.decoder = mpu.GPT2ParallelTransformer(num_layers,
                                               hidden_size,
                                               num_attention_heads,
                                               max_sequence_length,
                                               max_memory_length,
                                               embedding_dropout_prob,
                                               attention_dropout_prob,
                                               output_dropout_prob,
                                               checkpoint_activations,
                                               checkpoint_num_layers,
                                               use_decoder_layer=True)
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             max_sequence_length, max_memory_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, relative_encoding=False):
    super(GPT2Model, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Transformer
    self.transformer = mpu.GPT2ParallelTransformer(
        num_layers,
        hidden_size,
        num_attention_heads,
        max_sequence_length,
        max_memory_length,
        embedding_dropout_prob,
        attention_dropout_prob,
        output_dropout_prob,
        checkpoint_activations,
        checkpoint_num_layers,
        relative_encoding=relative_encoding)
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, num_experts=1,
             type_vocab_size=2):
    super(BertMixtureModel, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Token type embeddings.
    self.token_type_embeddings = torch.nn.Embedding(type_vocab_size,
                                                    hidden_size)
    # Initialize the token type embeddings.
    init_method(self.token_type_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Per-task output heads (row-parallel projections to the task outputs).
    self.hrs_head = RowParallelLinear(hidden_size, 1, input_is_parallel=True,
                                      init_method=init.xavier_normal_)
    init_method(self.hrs_head.weight)
    self.click_head = RowParallelLinear(hidden_size, 1, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.click_head.weight)
    self.lpsat_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.lpsat_head.weight)
    self.qc_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                     init_method=init.xavier_normal_)
    init_method(self.qc_head.weight)
    self.eff_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                      init_method=init.xavier_normal_)
    init_method(self.eff_head.weight)
    self.local_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.local_head.weight)
    self.fresh_head = RowParallelLinear(hidden_size, 5, input_is_parallel=True,
                                        init_method=init.xavier_normal_)
    init_method(self.fresh_head.weight)

    # Transformer
    self.transformer = mpu.BertParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   num_experts=num_experts)

    self.dropout = torch.nn.Dropout(output_dropout_prob)

    # Per-task intermediate dense layers (column-parallel, hidden -> hidden).
    self.dense_hrs0 = ColumnParallelLinear(hidden_size, hidden_size,
                                           gather_output=False,
                                           init_method=init.xavier_normal_)
    self.dense_click0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_lpsat0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_qc0 = ColumnParallelLinear(hidden_size, hidden_size,
                                          gather_output=False,
                                          init_method=init.xavier_normal_)
    self.dense_eff0 = ColumnParallelLinear(hidden_size, hidden_size,
                                           gather_output=False,
                                           init_method=init.xavier_normal_)
    self.dense_local0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
    self.dense_fresh0 = ColumnParallelLinear(hidden_size, hidden_size,
                                             gather_output=False,
                                             init_method=init.xavier_normal_)
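# --- Illustrative sketch (not part of the original model code) ---
# Plain-PyTorch stand-in for one of the per-task branches above: a paired
# dense_*0 (hidden -> hidden) projection followed by its *_head
# (hidden -> output) layer applied to a pooled representation. The
# intermediate tanh activation and the dropout placement are assumptions,
# not the repository's forward pass, and the parallel layers are replaced
# with ordinary torch.nn.Linear modules for the sake of a runnable example.
import torch

hidden_size, batch = 8, 2
dense_hrs0 = torch.nn.Linear(hidden_size, hidden_size)
hrs_head = torch.nn.Linear(hidden_size, 1)
dropout = torch.nn.Dropout(0.1)

pooled = torch.randn(batch, hidden_size)                 # pooled encoder output
hrs_score = hrs_head(dropout(torch.tanh(dense_hrs0(pooled))))
print(hrs_score.shape)  # torch.Size([2, 1])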
def __init__(self, num_layers, vocab_size, hidden_size, num_attention_heads,
             embedding_dropout_prob, attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, max_sequence_length, checkpoint_activations,
             checkpoint_num_layers=1, parallel_output=True, num_experts=1,
             type_vocab_size=2):
    super(BertMixtureModel_v0, self).__init__()

    self.parallel_output = parallel_output

    init_method = init_method_normal(std=0.02)

    # Word embeddings (parallel).
    self.word_embeddings = mpu.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init_method)

    # Position embedding (serial).
    self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                  hidden_size)
    # Initialize the position embeddings.
    init_method(self.position_embeddings.weight)

    # Token type embeddings.
    self.token_type_embeddings = torch.nn.Embedding(type_vocab_size,
                                                    hidden_size)
    # Initialize the token type embeddings.
    init_method(self.token_type_embeddings.weight)

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Scalar prediction heads (hrs and click).
    self.hrs_head = torch.nn.Linear(hidden_size, 1)
    init_method(self.hrs_head.weight)
    self.click_head = torch.nn.Linear(hidden_size, 1)
    init_method(self.click_head.weight)

    # Transformer
    self.transformer = mpu.BertParallelTransformer(num_layers,
                                                   hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_dropout_prob,
                                                   checkpoint_activations,
                                                   checkpoint_num_layers,
                                                   num_experts=num_experts)

    self.dropout = torch.nn.Dropout(output_dropout_prob)