Python Noam.Noam Exemples, labml_nn.optimizers.noam.Noam.Noam Python Exemples

Exemple #1

0

Afficher le fichier

    def __init__(self, configs: Configs):
        self.device = torch.device('cpu')
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        self.dataset = TinyShakespeareDataset(configs.seq_len)
        self.dataloader = DataLoader(self.dataset,
                                     batch_size=configs.batch_size,
                                     collate_fn=transpose_batch,
                                     shuffle=True)

        if configs.glu_variant == 'GLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.Sigmoid(), True, False, False, False)
        elif configs.glu_variant == 'Bilinear':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.Identity(), True, False, False, False)
        elif configs.glu_variant == 'ReGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.ReLU(), True, False, False, False)
        elif configs.glu_variant == 'GEGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.GELU(), True, False, False, False)
        elif configs.glu_variant == 'SwiGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.SiLU(), True, False, False, False)
        elif configs.glu_variant == 'ReLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.ReLU())
        elif configs.glu_variant == 'GELU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.GELU())
        else:
            raise ValueError(f'Unknown variant {configs.glu_variant}')

        n_chars = len(self.dataset.stoi)
        self.model = AutoregressiveModel(
            EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
            Encoder(
                TransformerLayer(d_model=configs.d_model,
                                 self_attn=MultiHeadAttention(
                                     configs.n_heads, configs.d_model,
                                     configs.dropout),
                                 src_attn=None,
                                 feed_forward=ffn,
                                 dropout_prob=configs.dropout),
                configs.n_layers), nn.Linear(configs.d_model, n_chars))
        self.model.to(self.device)

        self.optimizer = Noam(self.model.parameters(),
                              lr=1.0,
                              warmup=2_000,
                              d_model=configs.d_model)

        self.loss_func = nn.CrossEntropyLoss()
        self.epochs = configs.epochs
        self.grad_norm_clip = configs.grad_norm_clip

        # Set tracker configurations
        tracker.set_scalar("loss.*", True)

Exemple #2

0

Afficher le fichier

def _noam_optimizer(c: OptimizerConfigs):
    from labml_nn.optimizers.noam import Noam
    return Noam(c.parameters,
                lr=c.learning_rate,
                betas=c.betas,
                eps=c.eps,
                weight_decay=c.weight_decay_obj,
                amsgrad=c.amsgrad,
                warmup=c.warmup,
                d_model=c.d_model)

Exemple #3

0

Afficher le fichier

    def __init__(self, configs: Configs):
        # Get the device
        self.device = torch.device('cpu')
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        # Initialize the dataset
        self.dataset = TinyShakespeareDataset(configs.seq_len)
        # Initialize the dataloader
        self.dataloader = DataLoader(self.dataset,
                                     batch_size=configs.batch_size,
                                     collate_fn=transpose_batch,
                                     shuffle=True)

        # FFN with Gated Linear Unit
        # $$FFN_{GLU}(x)(x, W_1, V, W_2) = (\sigma(x W_1) \otimes x V) W_2$$
        if configs.glu_variant == 'GLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.Sigmoid(), True, False, False, False)
        # FFN with Bilinear hidden layer
        # $$FFN_{Bilinear}(x)(x, W_1, V, W_2) = (x W_1 \otimes x V) W_2$$
        elif configs.glu_variant == 'Bilinear':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.Identity(), True, False, False, False)
        # FFN with ReLU gate
        # $$FFN_{ReGLU}(x)(x, W_1, V, W_2) = (\max(0, x W_1) \otimes x V) W_2$$
        elif configs.glu_variant == 'ReGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.ReLU(), True, False, False, False)
        # FFN with GELU gate
        # $$FFN_{GEGLU}(x)(x, W_1, V, W_2) = (\text{GELU}(x W_1) \otimes x V) W_2$$
        elif configs.glu_variant == 'GEGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.GELU(), True, False, False, False)
        # FFN with Swish gate
        # $$FFN_{SwiGLU}(x)(x, W_1, V, W_2) = (\text{Swish}_1(x W_1) \otimes x V) W_2$$
        # where $\text{Swish}_\beta(x) = x \sigma(\beta x)$
        elif configs.glu_variant == 'SwiGLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.SiLU(), True, False, False, False)
        # FFN with ReLU activation
        # $$FFN_{ReLU}(x)(x, W_1, W_2, b_1, b_2) = \text{ReLU}_1(x W_1 + b_1) W_2 + b_2$$
        elif configs.glu_variant == 'ReLU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.ReLU())
        # FFN with ReLU activation
        # $$FFN_{GELU}(x)(x, W_1, W_2, b_1, b_2) = \text{GELU}_1(x W_1 + b_1) W_2 + b_2$$
        elif configs.glu_variant == 'GELU':
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              nn.GELU())
        else:
            raise ValueError(f'Unknown variant {configs.glu_variant}')

        # Number of different characters
        n_chars = len(self.dataset.stoi)

        # Initialize [Multi-Head Attention module](../mha.html)
        mha = MultiHeadAttention(configs.n_heads, configs.d_model,
                                 configs.dropout)
        # Initialize the [Transformer Block](../models.html#TransformerLayer)
        transformer_layer = TransformerLayer(d_model=configs.d_model,
                                             self_attn=mha,
                                             src_attn=None,
                                             feed_forward=ffn,
                                             dropout_prob=configs.dropout)
        # Initialize the model with an
        # [embedding layer](../models.html#EmbeddingsWithPositionalEncoding)
        # (with fixed positional encoding)
        # [transformer encoder](../models.html#Encoder) and
        # a linear layer to generate logits.
        self.model = AutoregressiveModel(
            EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
            Encoder(transformer_layer, configs.n_layers),
            nn.Linear(configs.d_model, n_chars))

        # Move the model to the current device
        self.model.to(self.device)

        # Initialize [Noam optimizer](../../optimizers/noam.html)
        self.optimizer = Noam(self.model.parameters(),
                              lr=1.0,
                              warmup=2_000,
                              d_model=configs.d_model)

        # Cross-entropy loss
        self.loss_func = nn.CrossEntropyLoss()
        # Number of training epochs;
        # *note that our dataset definition repeats the data `seq_len` times in a single epoch
        self.epochs = configs.epochs
        # Gradient clipping norm
        self.grad_norm_clip = configs.grad_norm_clip

        # Set tracker configurations
        tracker.set_scalar("loss.*", True)

Exemple #4

0

Afficher le fichier

Fichier : train.py Projet : weihaoxie/nn

def train():
    """
    ## Create and train a small model
    """

    # Create an experiment
    experiment.create(name='retro_small')

    # GPU device
    device = torch.device('cuda:0')

    # Load Tiny Shakespeare dataset
    tds = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url=
        'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    )

    # Load [Retro dataset](dataset.html)
    train_dataset = Dataset(lab.get_data_path() / 'retro_train_dataset.json',
                            tds)

    # Create dataloader
    train_dl = DataLoader(train_dataset,
                          batch_size=4,
                          sampler=RandomSampler(train_dataset,
                                                replacement=True))

    # Hyper-parameters
    chunk_len = 16
    d_model = 128
    d_ff = 512
    n_heads = 16
    d_k = 16

    # Create the nearest neighbor encoder
    nearest_neighbor_encoder = NearestNeighborEncoder(chunk_len, 6, {3},
                                                      d_model, n_heads, d_k,
                                                      d_ff)
    # Create the model
    model = RetroModel(tds.n_tokens,
                       d_model,
                       6, {3, 5},
                       chunk_len,
                       n_heads,
                       d_k,
                       d_ff,
                       encoder=nearest_neighbor_encoder)
    # Move the model to the device
    model = model.to(device)
    # Create the optimizer
    optimizer = Noam(model.parameters(), lr=1., d_model=d_model, warmup=2_000)
    # Create the `Trainer`
    trainer = Trainer(device, model, train_dl, optimizer)
    # Create the `Sampler`
    sampler = Sampler(device, model, tds, chunk_len)
    #
    prompt = '''Second Citizen:\nOne word, good citizens.\n\nFirst Citizen:'''

    # Set models for saving and loading
    experiment.add_pytorch_models(model=model)

    # Start the experiment
    with experiment.start():
        # Train for `32` epochs
        for epoch in monit.loop(32):
            # Train
            trainer()
            # Print a new line
            tracker.new_line()
            # Sample from the `prompt`
            logger.log([(prompt.replace('\n', '\\n\n'), Text.subtle),
                        (sampler.sample(prompt,
                                        128).replace('\n',
                                                     '\\n\n'), Text.none)])
            # Save models
            experiment.save_checkpoint()