def __init__(self, configs: Configs):
    """
    Set up the data pipeline, model, optimizer and loss from `configs`.

    :param configs: hyper-parameters; this constructor reads `seq_len`, `batch_size`,
        `glu_variant`, `d_model`, `d_ff`, `dropout`, `n_heads`, `n_layers`, `epochs`
        and `grad_norm_clip`
    :raises ValueError: if `configs.glu_variant` is not one of the known variant names
    """
    # Use the first CUDA device when one is available, otherwise the CPU
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Dataset and a shuffling dataloader that collates batches with `transpose_batch`
    self.dataset = TinyShakespeareDataset(configs.seq_len)
    self.dataloader = DataLoader(self.dataset,
                                 batch_size=configs.batch_size,
                                 collate_fn=transpose_batch,
                                 shuffle=True)

    # `(variant name, activation class, whether the gated argument set is passed)`
    known_variants = (
        ('GLU', nn.Sigmoid, True),
        ('Bilinear', nn.Identity, True),
        ('ReGLU', nn.ReLU, True),
        ('GEGLU', nn.GELU, True),
        ('SwiGLU', nn.SiLU, True),
        ('ReLU', nn.ReLU, False),
        ('GELU', nn.GELU, False),
    )
    for variant_name, activation_cls, gated in known_variants:
        if configs.glu_variant == variant_name:
            # Gated variants pass four extra positional flags; the plain variants
            # pass none, so `FeedForward` falls back to its own defaults.
            # NOTE(review): presumably gating on and all biases off - confirm
            # against the `FeedForward` signature.
            extra_args = (True, False, False, False) if gated else ()
            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout,
                              activation_cls(), *extra_args)
            break
    else:
        # No entry of the table matched
        raise ValueError(f'Unknown variant {configs.glu_variant}')

    # The vocabulary size is the number of entries in the dataset's `stoi` mapping
    vocab_size = len(self.dataset.stoi)

    # Build the model pieces in the same order the nested expression evaluated them
    embeddings = EmbeddingsWithPositionalEncoding(configs.d_model, vocab_size)
    attention = MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout)
    layer = TransformerLayer(d_model=configs.d_model,
                             self_attn=attention,
                             src_attn=None,
                             feed_forward=ffn,
                             dropout_prob=configs.dropout)
    encoder = Encoder(layer, configs.n_layers)
    to_logits = nn.Linear(configs.d_model, vocab_size)
    self.model = AutoregressiveModel(embeddings, encoder, to_logits)

    # Move the model to the selected device before creating the optimizer
    self.model.to(self.device)
    self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)

    # Training settings taken straight from the configurations
    self.epochs = configs.epochs
    self.grad_norm_clip = configs.grad_norm_clip
    self.loss_func = nn.CrossEntropyLoss()

    # Set tracker configurations
    tracker.set_scalar("loss.*", True)
def _noam_optimizer(c: OptimizerConfigs):
    """
    Create a `Noam` optimizer from the optimizer configurations.

    :param c: optimizer configurations; `parameters`, `learning_rate`, `betas`, `eps`,
        `weight_decay_obj`, `amsgrad`, `warmup` and `d_model` are read from it
    :return: the constructed `Noam` optimizer
    """
    # Function-scope import: the Noam module is only loaded when this factory runs
    from labml_nn.optimizers.noam import Noam

    # Parameters to optimize
    params = c.parameters

    # Keyword options forwarded to the optimizer unchanged
    options = dict(
        lr=c.learning_rate,
        betas=c.betas,
        eps=c.eps,
        weight_decay=c.weight_decay_obj,
        amsgrad=c.amsgrad,
        warmup=c.warmup,
        d_model=c.d_model,
    )
    return Noam(params, **options)
def __init__(self, configs: Configs):
    """
    Create the dataset, dataloader, model, optimizer and loss for the experiment.

    :param configs: hyper-parameters; `seq_len`, `batch_size`, `glu_variant`, `d_model`,
        `d_ff`, `dropout`, `n_heads`, `n_layers`, `epochs` and `grad_norm_clip` are read
    :raises ValueError: if `configs.glu_variant` is not one of the supported variants
    """
    # Pick the device: the first GPU if CUDA is available, otherwise the CPU
    cuda_available = torch.cuda.is_available()
    self.device = torch.device('cuda:0') if cuda_available else torch.device('cpu')

    # Dataset of `seq_len`-long samples
    self.dataset = TinyShakespeareDataset(configs.seq_len)
    # Shuffling dataloader; batches are collated by `transpose_batch`
    self.dataloader = DataLoader(self.dataset,
                                 batch_size=configs.batch_size,
                                 collate_fn=transpose_batch,
                                 shuffle=True)

    # Choose the activation and whether the gated argument set is used
    variant = configs.glu_variant
    # FFN with Gated Linear Unit
    # $$FFN_{GLU}(x, W_1, V, W_2) = (\sigma(x W_1) \otimes x V) W_2$$
    if variant == 'GLU':
        activation, is_gated = nn.Sigmoid(), True
    # FFN with Bilinear hidden layer
    # $$FFN_{Bilinear}(x, W_1, V, W_2) = (x W_1 \otimes x V) W_2$$
    elif variant == 'Bilinear':
        activation, is_gated = nn.Identity(), True
    # FFN with ReLU gate
    # $$FFN_{ReGLU}(x, W_1, V, W_2) = (\max(0, x W_1) \otimes x V) W_2$$
    elif variant == 'ReGLU':
        activation, is_gated = nn.ReLU(), True
    # FFN with GELU gate
    # $$FFN_{GEGLU}(x, W_1, V, W_2) = (\text{GELU}(x W_1) \otimes x V) W_2$$
    elif variant == 'GEGLU':
        activation, is_gated = nn.GELU(), True
    # FFN with Swish gate
    # $$FFN_{SwiGLU}(x, W_1, V, W_2) = (\text{Swish}_1(x W_1) \otimes x V) W_2$$
    # where $\text{Swish}_\beta(x) = x \sigma(\beta x)$
    elif variant == 'SwiGLU':
        activation, is_gated = nn.SiLU(), True
    # FFN with ReLU activation
    # $$FFN_{ReLU}(x, W_1, W_2, b_1, b_2) = \text{ReLU}(x W_1 + b_1) W_2 + b_2$$
    elif variant == 'ReLU':
        activation, is_gated = nn.ReLU(), False
    # FFN with GELU activation
    # $$FFN_{GELU}(x, W_1, W_2, b_1, b_2) = \text{GELU}(x W_1 + b_1) W_2 + b_2$$
    elif variant == 'GELU':
        activation, is_gated = nn.GELU(), False
    else:
        raise ValueError(f'Unknown variant {configs.glu_variant}')

    # Gated variants pass four extra positional flags; the plain variants pass none
    # and rely on the defaults of `FeedForward`.
    # NOTE(review): presumably gating on with every bias disabled - confirm against
    # the `FeedForward` signature.
    gated_args = (True, False, False, False) if is_gated else ()
    ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, activation, *gated_args)

    # Number of different characters
    vocab_size = len(self.dataset.stoi)

    # [Transformer Block](../models.html#TransformerLayer) with a
    # [Multi-Head Attention module](../mha.html) for self attention and no source attention
    encoder_layer = TransformerLayer(
        d_model=configs.d_model,
        self_attn=MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout),
        src_attn=None,
        feed_forward=ffn,
        dropout_prob=configs.dropout)

    # [Embedding layer](../models.html#EmbeddingsWithPositionalEncoding)
    # (with fixed positional encoding)
    embedding = EmbeddingsWithPositionalEncoding(configs.d_model, vocab_size)
    # [Transformer encoder](../models.html#Encoder) with `n_layers` layers
    encoder = Encoder(encoder_layer, configs.n_layers)
    # Linear layer to generate logits
    readout = nn.Linear(configs.d_model, vocab_size)
    # Combine the three parts into the autoregressive model
    self.model = AutoregressiveModel(embedding, encoder, readout)

    # Move the model to the current device
    self.model.to(self.device)

    # [Noam optimizer](../../optimizers/noam.html) over the model parameters
    self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)

    # Number of training epochs;
    # *note* that our dataset definition repeats the data `seq_len` times in a single epoch
    self.epochs = configs.epochs
    # Gradient clipping norm
    self.grad_norm_clip = configs.grad_norm_clip
    # Cross-entropy loss
    self.loss_func = nn.CrossEntropyLoss()

    # Set tracker configurations
    tracker.set_scalar("loss.*", True)
def train():
    """
    ## Create and train a small model

    Creates the `retro_small` experiment, builds the datasets, the model and the
    optimizer, then trains for `32` epochs. After every epoch it logs a sample
    generated from a fixed prompt and saves a checkpoint.

    Takes no arguments and returns nothing.
    """

    # Create an experiment
    experiment.create(name='retro_small')

    # Use the first GPU when CUDA is available and fall back to the CPU otherwise,
    # so the function does not fail on hosts without a GPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load Tiny Shakespeare dataset
    tds = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Load [Retro dataset](dataset.html)
    train_dataset = Dataset(lab.get_data_path() / 'retro_train_dataset.json', tds)

    # Create dataloader; batches of `4` are drawn at random with replacement
    train_dl = DataLoader(train_dataset,
                          batch_size=4,
                          sampler=RandomSampler(train_dataset, replacement=True))

    # Hyper-parameters
    chunk_len = 16
    d_model = 128
    d_ff = 512
    n_heads = 16
    d_k = 16

    # Create the nearest neighbor encoder
    nearest_neighbor_encoder = NearestNeighborEncoder(chunk_len, 6, {3}, d_model, n_heads, d_k, d_ff)
    # Create the model
    model = RetroModel(tds.n_tokens, d_model, 6,
                       {3, 5},
                       chunk_len, n_heads, d_k, d_ff,
                       encoder=nearest_neighbor_encoder)
    # Move the model to the device
    model = model.to(device)
    # Create the optimizer
    optimizer = Noam(model.parameters(), lr=1., d_model=d_model, warmup=2_000)
    # Create the `Trainer`
    trainer = Trainer(device, model, train_dl, optimizer)
    # Create the `Sampler`
    sampler = Sampler(device, model, tds, chunk_len)
    # Prompt used to sample from the model after each epoch
    prompt = '''Second Citizen:\nOne word, good citizens.\n\nFirst Citizen:'''

    # Set models for saving and loading
    experiment.add_pytorch_models(model=model)

    # Start the experiment
    with experiment.start():
        # Train for `32` epochs; the loop index itself is not needed
        for _ in monit.loop(32):
            # Train
            trainer()
            # Print a new line
            tracker.new_line()
            # Sample from the `prompt`; newlines are shown as a literal `\n`
            # followed by a real line break
            logger.log([(prompt.replace('\n', '\\n\n'), Text.subtle),
                        (sampler.sample(prompt, 128).replace('\n', '\\n\n'), Text.none)])
            # Save models
            experiment.save_checkpoint()