def main() -> None:
  r"""Script entry point."""
  # Parse command-line arguments.
  args = parse_arg()

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer configuration.
  tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(
    exp_name=tknzr_cfg.exp_name,
    tknzr_name=tknzr_cfg.tknzr_name,
  )

  # Load pre-trained model instance.
  model = lmp.util.model.load(
    ckpt=args.ckpt,
    tknzr=tknzr,
    **model_cfg.__dict__,
  )

  # Get inference method.
  infer = lmp.util.infer.create(
    max_seq_len=model_cfg.max_seq_len,
    **args.__dict__,
  )

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Set model to evaluation mode.
  # This turns off dropout layers in the model.
  model = model.eval()

  # Move model to running device.
  model = model.to(device)

  # Generate text with the specified inference method.
  txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

  # Output generated text.
  print(txt)
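# A minimal sketch of wiring this entry point up as a runnable script.  The
# flag names mirror the `args.*` attributes read above; the module path
# `lmp.script.gen_txt` in the sample invocation is hypothetical.
#
#   python -m lmp.script.gen_txt --exp_name my_model_exp --ckpt -1 --seed 42 \
#     --txt 'once upon a time'
if __name__ == '__main__':
  main()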
def main() -> None:
  r"""Script entry point."""
  # Parse command-line arguments.
  args = parse_arg()

  # Get dataset instance with specified version.
  dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

  # Mini-batch sampler.  Evaluation order does not matter, so we do not shuffle.
  dldr = torch.utils.data.DataLoader(
    dataset=dset,
    batch_size=args.batch_size,
    shuffle=False,
  )

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer configuration.
  tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(
    exp_name=tknzr_cfg.exp_name,
    tknzr_name=tknzr_cfg.tknzr_name,
  )

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Load pre-trained checkpoints ranging from `args.first_ckpt` to `args.last_ckpt`.
  for ckpt in lmp.util.model.list_ckpts(
      exp_name=args.exp_name,
      first_ckpt=args.first_ckpt,
      last_ckpt=args.last_ckpt,
  ):
    # Load pre-trained model instance from checkpoint `ckpt`.
    model = lmp.util.model.load(
      ckpt=ckpt,
      tknzr=tknzr,
      **model_cfg.__dict__,
    )

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Record average perplexity.
    avg_ppl = 0.0
    for batch_txt in tqdm(dldr):
      # Encode batch text into batch token ids.
      batch_tkids = tknzr.batch_enc(
        batch_txt=batch_txt,
        max_seq_len=model_cfg.max_seq_len,
      )

      # Convert batch token ids to `torch.Tensor` with `dtype == torch.int64`.
      batch_tkids = torch.LongTensor(batch_tkids)

      # Move tensors to model running device.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_prev_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Calculate perplexity.
      batch_avg_ppl = model.ppl(
        batch_next_tkids=batch_next_tkids,
        batch_prev_tkids=batch_prev_tkids,
      )

      # Accumulate average perplexity, weighting each batch mean by the batch's share of the dataset.
      avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)

    # Log average perplexity on dataset to CLI and tensorboard.
    writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
    print(f'checkpoint {ckpt} ppl: {avg_ppl}')
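# A self-contained sanity check (toy numbers, no lmp dependency) of the
# accumulation rule above: re-weighting each batch mean by the batch's share
# of the dataset, `batch_avg_ppl * len(batch_txt) / len(dset)`, sums to the
# dataset-level mean even when batch sizes differ.
per_sample_ppl = [10.0, 20.0, 30.0, 40.0, 50.0]
batches = [per_sample_ppl[:2], per_sample_ppl[2:]]
avg_ppl = 0.0
for batch in batches:
  batch_avg_ppl = sum(batch) / len(batch)
  avg_ppl += batch_avg_ppl * len(batch) / len(per_sample_ppl)

assert avg_ppl == sum(per_sample_ppl) / len(per_sample_ppl)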
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])

  # `args.first_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])

  # `args.last_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])

  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # We use TCP to perform RPC.  Timeout is set to 5 minutes.
  store = dist.TCPStore(
    is_master=args.rank == HOST_RANK,
    host_name=args.host_name,
    port=args.host_port,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Use NCCL backend to perform CUDA collectives.
  dist.init_process_group(
    backend=dist.Backend.NCCL,
    store=store,
    rank=args.rank,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Sync arguments.
  dist_args_k = ['host_name', 'host_port', 'local_rank', 'rank', 'world_size']
  for k in args.__dict__.keys():
    if k in dist_args_k:
      continue

    # Host broadcasts arguments.
    if args.rank == HOST_RANK:
      store.set(k, str(args.__dict__[k]))
    # Non-host processes receive arguments from the host.
    else:
      v = store.get(k)
      if isinstance(args.__dict__[k], str):
        args.__dict__[k] = v.decode('utf-8')
      else:
        args.__dict__[k] = type(args.__dict__[k])(v)

  # Set random seed for reproducibility.  Note that each process uses a different seed to get a different slice of
  # each batch.
  lmp.util.rand.set_seed(seed=args.seed + args.rank)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.local_rank}')

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

  # Get dataset instance and convert samples to tensors.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )

  dset_size = len(dset)

  # Mini-batch sampler.  Each process gets batches exclusive to itself.
  dist_sampler = torch.utils.data.distributed.DistributedSampler(
    num_replicas=args.world_size,
    rank=args.rank,
    dataset=dset,
    shuffle=False,
  )

  # Distributed mini-batch loader.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up the process (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size // args.world_size,
    dataset=dset,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
    sampler=dist_sampler,
  )

  # Get tensorboard logger instance.  Only the main process needs to log performance.
  if args.rank == HOST_RANK:
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
  else:
    writer = None

  # Evaluate checkpoints within range.
  for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name, first_ckpt=args.first_ckpt, last_ckpt=args.last_ckpt):
    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Create DDP model.
    ddp_model = torch.nn.parallel.DistributedDataParallel(model)

    # Processes may receive unevenly distributed numbers of batches.  Thus one must use `ddp_model.join()` to avoid
    # deadlock.
    with ddp_model.join():
      # Record average perplexity.
      avg_ppl = 0.0
      for batch_tkids in tqdm(data_loader):
        # Move token id tensors to the same running device as the model.
        batch_tkids = batch_tkids.to(device)

        # Format batch token ids to satisfy language model training format.
        batch_cur_tkids = batch_tkids[..., :-1]
        batch_next_tkids = batch_tkids[..., 1:]

        # Loop over token ids to get next token id prediction probability distributions.
        batch_prev_states = None
        batch_tkids_pd = []
        for i in range(batch_cur_tkids.size(1)):
          batch_next_tkids_pd, batch_prev_states = model.pred(
            batch_cur_tkids=batch_cur_tkids[:, i],
            batch_prev_states=batch_prev_states,
          )

          # Collect prediction probability distribution.
          batch_tkids_pd.append(batch_next_tkids_pd)

        # Calculate perplexity.
        batch_ppl = lmp.util.metric.ppl(
          batch_tkids=batch_next_tkids,
          batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1),
        )

        # Sum `batch_ppl` from each process.
        dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)

        # Accumulate average perplexity.
        avg_ppl += (batch_ppl / dset_size).sum().item()

    # Log average perplexity on dataset to CLI and tensorboard.  Only the main process needs to log performance.
    if args.rank == HOST_RANK:
      writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
      print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_ppl
  del batch_cur_tkids
  del batch_next_tkids
  del batch_next_tkids_pd
  del batch_ppl
  del batch_prev_states
  del batch_tkids
  del batch_tkids_pd
  del ckpt
  del data_loader
  del device
  del dset
  del dset_size
  del model
  del model_cfg
  del tknzr
  del writer
  torch.cuda.empty_cache()
  gc.collect()
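# What `dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)` does above: every
# rank computes per-sample perplexities for its own shard, and the SUM
# reduction leaves each rank holding the element-wise total, so dividing by
# `dset_size` and summing yields the global average.  A single-process sketch
# (world_size = 1, CPU-friendly gloo backend) just to exercise the call:
import os

import torch
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='gloo', rank=0, world_size=1)

batch_ppl = torch.tensor([12.5, 7.5])
dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)  # Identity when world_size == 1.
print(batch_ppl)

dist.destroy_process_group()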
def main() -> None:
  r"""Script entry point."""
  # Parse command-line arguments.
  args = parse_arg()

  # Save training configuration.
  lmp.util.cfg.save(args=args, exp_name=args.exp_name)

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get dataset instance with specified version.
  dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

  # Mini-batch random sampler.
  dldr = torch.utils.data.DataLoader(
    dataset=dset,
    batch_size=args.batch_size,
    shuffle=True,
  )

  # Load pre-trained tokenizer.
  tknzr_cfg = lmp.util.cfg.load(exp_name=args.tknzr_exp_name)
  tknzr = lmp.util.tknzr.load(
    exp_name=args.tknzr_exp_name,
    tknzr_name=tknzr_cfg.tknzr_name,
  )

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Get new model instance.
  model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
  model = model.train()

  # Move model to running device.
  model = model.to(device)

  # Remove weight decay on bias and layer-norm parameters.
  no_decay = ['bias', 'LayerNorm.weight']
  optim_group_params = [
    {
      'params': [
        param for name, param in model.named_parameters() if not any(nd in name for nd in no_decay)
      ],
      'weight_decay': args.wd,
    },
    {
      'params': [
        param for name, param in model.named_parameters() if any(nd in name for nd in no_decay)
      ],
      'weight_decay': 0.0,
    },
  ]

  # Get new optimizer instance.
  optim = torch.optim.AdamW(
    optim_group_params,
    betas=(args.beta1, args.beta2),
    lr=args.lr,
    eps=args.eps,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Logged performance targets.
  pre_avg_loss = 0.0
  avg_loss = 0.0

  # Global optimization step.
  step = 0
  for epoch in range(args.n_epoch):
    tqdm_dldr = tqdm(
      dldr,
      desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}',
    )
    for batch_txt in tqdm_dldr:
      # Encode batch text into batch token ids.
      batch_tkids = tknzr.batch_enc(
        batch_txt=batch_txt,
        max_seq_len=args.max_seq_len,
      )

      # Convert batch token ids to `torch.Tensor` with `dtype == torch.int64`.
      batch_tkids = torch.LongTensor(batch_tkids)

      # Move tensors to model running device.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_prev_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Calculate loss using loss function.
      loss = model.loss_fn(
        batch_next_tkids=batch_next_tkids,
        batch_prev_tkids=batch_prev_tkids,
      )

      # Accumulate average loss.
      avg_loss += loss.item()

      # Backward pass / backpropagation.
      loss.backward()

      # Perform gradient clipping to avoid gradient explosion.
      torch.nn.utils.clip_grad_norm_(
        model.parameters(),
        max_norm=args.max_norm,
      )

      # Gradient descent.
      optim.step()

      # Clean up gradients, which `torch` accumulates by default.
      optim.zero_grad()

      # Increment global step.
      step += 1

      # Save checkpoint every `ckpt_step` steps.
      if step % args.ckpt_step == 0:
        model.save(ckpt=step, exp_name=args.exp_name)

      # Log performance every `log_step` steps.
      if step % args.log_step == 0:
        avg_loss = avg_loss / args.log_step

        # Log on CLI.
        tqdm_dldr.set_description(
          f'epoch: {epoch}, loss: {avg_loss:.6f}',
        )

        # Log on tensorboard.
        writer.add_scalar(
          f'loss/{args.dset_name}/{args.ver}',
          avg_loss,
          step,
        )

        # Reset logged performance.
        pre_avg_loss = avg_loss
        avg_loss = 0.0

  # Save last checkpoint.
  model.save(ckpt=step, exp_name=args.exp_name)

  # Close tensorboard logger.
  writer.close()
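# A standalone sanity check of the no-decay split above.  The toy module uses
# an attribute literally named `LayerNorm` (BERT-style naming, an assumption
# about the real model's parameter names) so the substring rule fires; only
# `fc.weight` should end up in the weight-decay group.
import torch


class Toy(torch.nn.Module):

  def __init__(self) -> None:
    super().__init__()
    self.fc = torch.nn.Linear(4, 4)
    self.LayerNorm = torch.nn.LayerNorm(4)


no_decay = ['bias', 'LayerNorm.weight']
decay_names = [name for name, _ in Toy().named_parameters() if not any(nd in name for nd in no_decay)]
assert decay_names == ['fc.weight']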
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.ckpt], val_names=['-1', 'args.ckpt'])

  # `args.txt` validation.
  lmp.util.validate.raise_if_empty_str(val=args.txt, val_name='args.txt')

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

  # Load pre-trained model instance.
  model = lmp.util.model.load(ckpt=args.ckpt, exp_name=args.exp_name)

  # Set model to evaluation mode.  This turns off dropout layers in the model.
  model = model.eval()

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Move model to running device.
  model = model.to(device)

  # Get inference method.
  infer = lmp.util.infer.create(**args.__dict__)

  # Generate text with the specified inference method.
  txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

  # Output generated text.
  print(txt)

  # Free memory.  This is only needed for unit tests.
  del args
  del device
  del infer
  del model
  del model_cfg
  del tknzr
  del txt
  torch.cuda.empty_cache()
  gc.collect()
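# `lmp.util.validate.raise_if_wrong_ordered` is used throughout these scripts
# but not shown.  Judging from its call sites, a minimal sketch of its
# presumable semantics: raise when the given values are not in ascending
# (non-decreasing) order.
from typing import List, Union


def raise_if_wrong_ordered(vals: List[Union[int, float]], val_names: List[str]) -> None:
  """Raise `ValueError` when `vals` is not sorted in ascending order."""
  for prev, curr in zip(vals[:-1], vals[1:]):
    if prev > curr:
      raise ValueError(f'Must have {" <= ".join(val_names)}.')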
def load_model(
    checkpoint: int,
    d_emb: int,
    d_hid: int,
    device: torch.device,
    dropout: float,
    experiment: str,
    model_class: str,
    num_linear_layers: int,
    num_rnn_layers: int,
    pad_token_id: int,
    vocab_size: int
) -> lmp.model.BaseRNNModel:
  r"""Helper function for constructing language model.

  Load model from pre-trained checkpoint when `checkpoint != -1`.

  Args:
    checkpoint:
      Pre-trained model's checkpoint.
    d_emb:
      Embedding matrix vector dimension.
    d_hid:
      GRU layers hidden dimension.
    device:
      Model running device.
    dropout:
      Dropout probability on all layers' output (except the output layer).
    experiment:
      Name of the pre-trained experiment.
    model_class:
      Language model class to construct.
    num_linear_layers:
      Number of Linear layers to use.
    num_rnn_layers:
      Number of GRU layers to use.
    pad_token_id:
      Padding token's id.  Embedding layers will initialize padding token's
      vector with zeros.
    vocab_size:
      Embedding matrix vocabulary dimension.

  Raises:
    ValueError:
      When `model_class` is not supported.

  Returns:
    `lmp.model.BaseRNNModel` if `model_class == 'rnn'`;
    `lmp.model.GRUModel` if `model_class == 'gru'`;
    `lmp.model.LSTMModel` if `model_class == 'lstm'`.
  """
  if model_class == 'rnn':
    model = lmp.model.BaseRNNModel(
      d_emb=d_emb,
      d_hid=d_hid,
      dropout=dropout,
      num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers,
      pad_token_id=pad_token_id,
      vocab_size=vocab_size
    )
  elif model_class == 'gru':
    model = lmp.model.GRUModel(
      d_emb=d_emb,
      d_hid=d_hid,
      dropout=dropout,
      num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers,
      pad_token_id=pad_token_id,
      vocab_size=vocab_size
    )
  elif model_class == 'lstm':
    model = lmp.model.LSTMModel(
      d_emb=d_emb,
      d_hid=d_hid,
      dropout=dropout,
      num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers,
      pad_token_id=pad_token_id,
      vocab_size=vocab_size
    )
  else:
    raise ValueError(
      f'model `{model_class}` is not supported.\nSupported options:' +
      ''.join(map(lambda option: f'\n\t--model {option}', ['rnn', 'gru', 'lstm']))
    )

  if checkpoint != -1:
    file_path = f'{lmp.path.DATA_PATH}/{experiment}/model-{checkpoint}.pt'
    if not os.path.exists(file_path):
      raise FileNotFoundError(f'file {file_path} does not exist.')
    model.load_state_dict(torch.load(file_path))

  return model.to(device)
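# The if/elif chain above repeats an identical constructor call per class.  A
# sketch of the same selection logic as a dict dispatch (a refactoring
# suggestion, not the project's actual helper):
import lmp.model

MODEL_CLASS_MAP = {
  'rnn': lmp.model.BaseRNNModel,
  'gru': lmp.model.GRUModel,
  'lstm': lmp.model.LSTMModel,
}


def build_model(model_class: str, **kwargs):
  if model_class not in MODEL_CLASS_MAP:
    raise ValueError(
      f'model `{model_class}` is not supported.\nSupported options:' +
      ''.join(f'\n\t--model {option}' for option in MODEL_CLASS_MAP)
    )
  return MODEL_CLASS_MAP[model_class](**kwargs)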
def load_model(
    checkpoint: int,
    d_emb: int,
    d_hid: int,
    device: torch.device,
    dropout: float,
    experiment: str,
    model_class: str,
    num_linear_layers: int,
    num_rnn_layers: int,
    pad_token_id: int,
    vocab_size: int
) -> Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]:
  r"""Helper function for constructing language model.

  Supported options:
    --model_class rnn
    --model_class gru
    --model_class lstm
    --model_class res_rnn
    --model_class res_gru
    --model_class res_lstm

  Load model from pre-trained checkpoint when `checkpoint != -1`.

  Args:
    checkpoint:
      Pre-trained model's checkpoint.  Must be bigger than or equal to `-1`.
    d_emb:
      Embedding matrix vector dimension.  Must be bigger than or equal to `1`.
    d_hid:
      Model layers hidden dimension.  Must be bigger than or equal to `1`.
    device:
      Model running device.
    dropout:
      Dropout probability on all layers' output (except the output layer).
      Must range from `0.0` to `1.0`.
    experiment:
      Name of the pre-trained experiment.  Must not be empty when
      `checkpoint != -1`.
    model_class:
      Language model class to construct.
    num_linear_layers:
      Number of Linear layers to use.  Must be bigger than or equal to `1`.
    num_rnn_layers:
      Number of RNN layers to use.  Must be bigger than or equal to `1`.
    pad_token_id:
      Padding token's id.  Embedding layer will initialize padding token's
      vector with zeros.  Must be bigger than or equal to `0`, and must be
      smaller than `vocab_size`.
    vocab_size:
      Embedding matrix vocabulary dimension.  Must be bigger than or equal to
      `1`.

  Raises:
    TypeError:
      When one of the arguments is not an instance of its type annotation.
    ValueError:
      When one of the arguments does not follow its constraints.  See
      docstring for arguments' constraints.

  Returns:
    `lmp.model.BaseRNNModel` if `model_class == 'rnn'`;
    `lmp.model.GRUModel` if `model_class == 'gru'`;
    `lmp.model.LSTMModel` if `model_class == 'lstm'`;
    `lmp.model.BaseResRNNModel` if `model_class == 'res_rnn'`;
    `lmp.model.ResGRUModel` if `model_class == 'res_gru'`;
    `lmp.model.ResLSTMModel` if `model_class == 'res_lstm'`.
  """
  # Type check.
  if not isinstance(checkpoint, int):
    raise TypeError('`checkpoint` must be an instance of `int`.')

  if not isinstance(experiment, str):
    raise TypeError('`experiment` must be an instance of `str`.')

  if not isinstance(device, torch.device):
    raise TypeError('`device` must be an instance of `torch.device`.')

  if not isinstance(model_class, str):
    raise TypeError('`model_class` must be an instance of `str`.')

  # Value check.
  if checkpoint < -1:
    raise ValueError('`checkpoint` must be bigger than or equal to `-1`.')

  if model_class == 'rnn':
    model = lmp.model.BaseRNNModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  elif model_class == 'gru':
    model = lmp.model.GRUModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  elif model_class == 'lstm':
    model = lmp.model.LSTMModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  elif model_class == 'res_rnn':
    model = lmp.model.BaseResRNNModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  elif model_class == 'res_gru':
    model = lmp.model.ResGRUModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  elif model_class == 'res_lstm':
    model = lmp.model.ResLSTMModel(
      d_emb=d_emb, d_hid=d_hid, dropout=dropout, num_rnn_layers=num_rnn_layers,
      num_linear_layers=num_linear_layers, pad_token_id=pad_token_id, vocab_size=vocab_size
    )
  else:
    raise ValueError(
      f'model `{model_class}` is not supported.\nSupported options:' + ''.join(
        map(
          lambda option: f'\n\t--model_class {option}',
          ['rnn', 'gru', 'lstm', 'res_rnn', 'res_gru', 'res_lstm'],
        )
      )
    )

  if checkpoint != -1:
    if not experiment:
      raise ValueError('`experiment` must not be empty.')

    file_path = os.path.join(lmp.path.DATA_PATH, experiment, f'model-{checkpoint}.pt')
    if not os.path.exists(file_path):
      raise FileNotFoundError(f'File {file_path} does not exist.')

    model.load_state_dict(torch.load(file_path))

  return model.to(device)
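# Example call with hypothetical hyperparameter values.  `checkpoint=-1`
# constructs a fresh (untrained) model, so no experiment files are touched.
import torch

model = load_model(
  checkpoint=-1,
  d_emb=100,
  d_hid=300,
  device=torch.device('cpu'),
  dropout=0.1,
  experiment='my_exp',  # Hypothetical name; only read when `checkpoint != -1`.
  model_class='res_lstm',
  num_linear_layers=1,
  num_rnn_layers=2,
  pad_token_id=0,
  vocab_size=10000,
)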
def analogy_inference(
    device: torch.device,
    model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
    tokenizer: lmp.tokenizer.BaseTokenizer,
    word_a: str,
    word_b: str,
    word_c: str
) -> str:
  r"""Generate the analogous word based on `word_a`, `word_b` and `word_c`.

  This function performs word analogy based on the following rule:
    `word_a` : `word_b` = `word_c` : `word_d`
  where `word_d` is the prediction target.

  Args:
    device:
      Model running device.
    model:
      Language model.
    tokenizer:
      Converts tokens (including `word_a`, `word_b` and `word_c`) into token
      ids and converts the predicted token id back into a token (`word_d`).
      This is needed since we use a word embedding layer in our language
      model.
    word_a:
    word_b:
    word_c:
      Query words for word analogy.

  Raises:
    TypeError:
      When one of the arguments is not an instance of its type annotation.

  Returns:
    The predicted word following the word analogy rule.
  """
  # Type check.
  if not isinstance(device, torch.device):
    raise TypeError('`device` must be an instance of `torch.device`.')

  if not isinstance(model, (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)):
    raise TypeError(
      '`model` must be an instance of '
      '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
    )

  if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
    raise TypeError('`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.')

  if not isinstance(word_a, str):
    raise TypeError('`word_a` must be an instance of `str`.')

  if not isinstance(word_b, str):
    raise TypeError('`word_b` must be an instance of `str`.')

  if not isinstance(word_c, str):
    raise TypeError('`word_c` must be an instance of `str`.')

  # Evaluation mode.
  model.eval()
  model = model.to(device)

  # Convert tokens (query words) into token ids.
  word_a_id = torch.LongTensor([tokenizer.convert_token_to_id(word_a)])
  word_b_id = torch.LongTensor([tokenizer.convert_token_to_id(word_b)])
  word_c_id = torch.LongTensor([tokenizer.convert_token_to_id(word_c)])

  # Perform analogy calculation.
  # Shape: `(1, E)`.
  out = (
    model.emb_layer(word_b_id.to(device)) - model.emb_layer(word_a_id.to(device)) +
    model.emb_layer(word_c_id.to(device))
  )

  # Calculate cosine similarity.
  # Shape: `(V)`.
  pred = torch.nn.functional.cosine_similarity(
    out,
    model.emb_layer.weight,
  )

  # Get the token id with maximum cosine similarity.
  # Shape: `(1)`.
  word_d_id = pred.argmax(dim=0).to('cpu').item()

  # Convert back to token.
  return tokenizer.convert_id_to_token(word_d_id)
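# Example query, assuming `model` and `tokenizer` are an already-loaded,
# reasonably trained pair (hypothetical here).  Following the rule
# `word_a` : `word_b` = `word_c` : `word_d`, the classic test
# 'king' : 'queen' = 'man' : ?  should ideally yield 'woman'.
import torch

word_d = analogy_inference(
  device=torch.device('cpu'),
  model=model,
  tokenizer=tokenizer,
  word_a='king',
  word_b='queen',
  word_c='man',
)
print(word_d)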
def main() -> None:
  r"""Script entry point."""
  # Parse command-line arguments.
  args = parse_arg()

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer configuration.
  tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(
    exp_name=tknzr_cfg.exp_name,
    tknzr_name=tknzr_cfg.tknzr_name,
  )

  # Load pre-trained model instance.
  model = lmp.util.model.load(
    ckpt=args.ckpt,
    tknzr=tknzr,
    **model_cfg.__dict__,
  )

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Set model to evaluation mode.
  # This turns off dropout layers in the model.
  model = model.eval()

  # Move model to running device.
  model = model.to(device)

  # Encode text into token ids.
  # Wrap as a batch with only one sample since `model.ppl` only accepts batches.
  batch_tkids = tknzr.batch_enc(
    batch_txt=[args.txt],
    max_seq_len=model_cfg.max_seq_len,
  )

  # Convert token ids to `torch.Tensor` with `dtype == torch.int64`.
  batch_tkids = torch.LongTensor(batch_tkids)

  # Move tensors to model running device.
  batch_tkids = batch_tkids.to(device)

  # Format batch token ids to satisfy language model training format.
  batch_prev_tkids = batch_tkids[..., :-1]
  batch_next_tkids = batch_tkids[..., 1:]

  # Calculate perplexity.
  ppl = model.ppl(
    batch_next_tkids=batch_next_tkids,
    batch_prev_tkids=batch_prev_tkids,
  )

  # Output perplexity of the given sample.
  print(ppl)
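# A toy illustration of the `[..., :-1]` / `[..., 1:]` pair above: it aligns
# each input token with the token the model must predict next.  Token id
# values are made up for the example.
import torch

batch_tkids = torch.LongTensor([[0, 5, 6, 7, 1]])  # e.g. `[bos] a b c [eos]`.
batch_prev_tkids = batch_tkids[..., :-1]  # [[0, 5, 6, 7]]: inputs.
batch_next_tkids = batch_tkids[..., 1:]   # [[5, 6, 7, 1]]: targets.
assert batch_prev_tkids.size(1) == batch_next_tkids.size(1)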
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])

  # `args.first_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.first_ckpt], val_names=['-1', 'args.first_ckpt'])

  # `args.last_ckpt` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[-1, args.last_ckpt], val_names=['-1', 'args.last_ckpt'])

  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Load pre-trained model configuration.
  model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

  # Load pre-trained tokenizer instance.
  tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

  # Get dataset instance and convert samples to tensors.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=model_cfg.max_seq_len,
      tknzr=tknzr,
    )

  dset_size = len(dset)

  # Mini-batch loader.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up the process (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size,
    dataset=dset,
    shuffle=False,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Evaluate checkpoints within range.
  for ckpt in lmp.util.model.list_ckpts(exp_name=args.exp_name, first_ckpt=args.first_ckpt, last_ckpt=args.last_ckpt):
    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode.  This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Record average perplexity.
    avg_ppl = 0.0
    for batch_tkids in tqdm(data_loader):
      # Move token id tensors to the same running device as the model.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_cur_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Loop over token ids to get next token id prediction probability distributions.
      batch_prev_states = None
      batch_tkids_pd = []
      for i in range(batch_cur_tkids.size(1)):
        batch_next_tkids_pd, batch_prev_states = model.pred(
          batch_cur_tkids=batch_cur_tkids[:, i],
          batch_prev_states=batch_prev_states,
        )

        # Collect prediction probability distribution.
        batch_tkids_pd.append(batch_next_tkids_pd)

      # Calculate perplexity.
      batch_ppl = lmp.util.metric.ppl(
        batch_tkids=batch_next_tkids,
        batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1),
      )

      # Accumulate average perplexity.
      avg_ppl += (batch_ppl / dset_size).sum().item()

    # Log average perplexity on dataset to CLI and tensorboard.
    writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
    print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_ppl
  del batch_cur_tkids
  del batch_next_tkids
  del batch_next_tkids_pd
  del batch_ppl
  del batch_prev_states
  del batch_tkids
  del batch_tkids_pd
  del ckpt
  del data_loader
  del device
  del dset
  del dset_size
  del model
  del model_cfg
  del tknzr
  del writer
  torch.cuda.empty_cache()
  gc.collect()
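# The inner loop above collects one `(B, V)` distribution per time step, and
# `torch.stack(batch_tkids_pd, dim=1)` reassembles them into a `(B, S, V)`
# tensor aligned with `batch_next_tkids`.  A toy shape check:
import torch

B, S, V = 2, 4, 8
batch_tkids_pd = [torch.softmax(torch.randn(B, V), dim=-1) for _ in range(S)]
assert torch.stack(batch_tkids_pd, dim=1).shape == (B, S, V)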
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])

  # `args.ckpt_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.ckpt_step], val_names=['1', 'args.ckpt_step'])

  # `args.log_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.log_step], val_names=['1', 'args.log_step'])

  # `args.max_norm` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.max_norm], val_names=['0', 'args.max_norm'])

  # `args.n_epoch` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.n_epoch], val_names=['1', 'args.n_epoch'])

  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # Save training configuration.
  lmp.util.cfg.save(args=args, exp_name=args.exp_name)

  # Set random seed for reproducibility.
  lmp.util.rand.set_seed(seed=args.seed)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device('cuda')

  # Load pre-trained tokenizer.
  tknzr = lmp.util.tknzr.load(exp_name=args.tknzr_exp_name)

  # Get dataset instance and convert samples to tensors.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )

  # Mini-batch random sampler.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up the process (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size,
    dataset=dset,
    shuffle=True,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
  )

  # Get new model instance and move model to running device.
  model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
  model = model.train()
  model = model.to(device)

  # Get new optimizer instance.
  optim = lmp.util.optim.get_optimizer(
    beta1=args.beta1,
    beta2=args.beta2,
    eps=args.eps,
    lr=args.lr,
    model=model,
    wd=args.wd,
  )

  # Get learning rate scheduler.
  schdl = lmp.util.optim.get_scheduler(
    optim=optim,
    total_step=args.n_epoch * len(data_loader),
    warmup_step=args.warmup_step,
  )

  # Get tensorboard logger instance.
  writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

  # Logged performance targets.
  pre_avg_loss = 0.0
  avg_loss = 0.0

  # Global optimization step.
  step = 0
  for epoch in range(args.n_epoch):
    tqdm_data_loader = tqdm(data_loader, desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}', dynamic_ncols=True)
    for batch_tkids in tqdm_data_loader:
      # Samples are already encoded as token id tensors; move them to the same running device as the model.
      batch_tkids = batch_tkids.to(device)

      # Format batch token ids to satisfy language model training format.
      batch_cur_tkids = batch_tkids[..., :-1]
      batch_next_tkids = batch_tkids[..., 1:]

      # Calculate loss using loss function.
      loss = model(batch_cur_tkids=batch_cur_tkids, batch_next_tkids=batch_next_tkids)

      # Accumulate average loss for logging.  Use `.item()` to avoid keeping the computation graph alive.
      avg_loss += loss.item()

      # Perform backward pass / backpropagation.
      loss.backward()

      # Perform gradient clipping to avoid gradient explosion.
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.max_norm)

      # Gradient descent.
      optim.step()

      # Update learning rate.
      schdl.step()

      # Clean up gradients.
      optim.zero_grad()

      # Increment global step.
      step += 1

      # Save checkpoint every `ckpt_step` steps.  We deep-copy the model and move the copy to CPU so the original
      # stays on the running device.
      if step % args.ckpt_step == 0:
        lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

      # Log performance every `log_step` steps.
      if step % args.log_step == 0:
        avg_loss = avg_loss / args.log_step

        # Log on CLI.
        tqdm_data_loader.set_description(f'epoch: {epoch}, loss: {avg_loss:.6f}')

        # Log on tensorboard.
        writer.add_scalar(f'train-loss/{args.dset_name}/{args.ver}', avg_loss, step)
        writer.add_scalar('lr', schdl.get_last_lr()[0], step)

        # Reset logged performance.
        pre_avg_loss = avg_loss
        avg_loss = 0.0

  # Save last checkpoint.
  lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

  # Close tensorboard logger.
  writer.close()

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_loss
  del batch_cur_tkids
  del batch_next_tkids
  del batch_tkids
  del data_loader
  del device
  del dset
  del loss
  del model
  del optim
  del pre_avg_loss
  del schdl
  del step
  del tknzr
  del tqdm_data_loader
  del writer
  torch.cuda.empty_cache()
  gc.collect()
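# `lmp.util.optim.get_scheduler` is not shown in this excerpt.  A common
# schedule matching its signature, linear warmup followed by linear decay to
# zero over `total_step`, can be sketched with `LambdaLR`.  This is an
# assumption about its behavior, not the library's actual implementation.
import torch


def get_scheduler_sketch(
    optim: torch.optim.Optimizer,
    total_step: int,
    warmup_step: int,
) -> torch.optim.lr_scheduler.LambdaLR:

  def lr_lambda(step: int) -> float:
    # Ramp up linearly during warmup, then decay linearly to zero.
    if step < warmup_step:
      return step / max(1, warmup_step)
    return max(0.0, (total_step - step) / max(1, total_step - warmup_step))

  return torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda)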
def main(argv: List[str]) -> None:
  """Script entry point.

  Parameters
  ----------
  argv: list[str]
    List of CLI arguments.

  Returns
  -------
  None
  """
  # Parse CLI arguments.
  args = parse_args(argv=argv)

  # `args.batch_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.batch_size], val_names=['1', 'args.batch_size'])

  # `args.ckpt_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.ckpt_step], val_names=['1', 'args.ckpt_step'])

  # `args.log_step` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.log_step], val_names=['1', 'args.log_step'])

  # `args.max_norm` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.max_norm], val_names=['0', 'args.max_norm'])

  # `args.n_epoch` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[1, args.n_epoch], val_names=['1', 'args.n_epoch'])

  # `args.n_worker` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
    val_names=['0', 'args.n_worker', 'number of available CPUs'],
  )
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[args.n_worker, args.batch_size],
    val_names=['args.n_worker', 'args.batch_size'],
  )

  # `args.world_size` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.world_size], val_names=['0', 'args.world_size'])

  # `args.local_rank` validation.
  lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.local_rank], val_names=['0', 'args.local_rank'])

  # `args.rank` validation.
  lmp.util.validate.raise_if_wrong_ordered(
    vals=[0, args.rank, args.world_size - 1],
    val_names=['0', 'args.rank', 'args.world_size - 1'],
  )

  # Save training configuration.  Only the main process needs to save the configuration.
  if args.rank == HOST_RANK:
    lmp.util.cfg.save(args=args, exp_name=args.exp_name)

  # We use TCP to perform RPC.  Timeout is set to 5 minutes.
  store = dist.TCPStore(
    is_master=args.rank == HOST_RANK,
    host_name=args.host_name,
    port=args.host_port,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Use NCCL backend to perform CUDA collectives.
  dist.init_process_group(
    backend=dist.Backend.NCCL,
    store=store,
    rank=args.rank,
    timeout=timedelta(minutes=5),
    world_size=args.world_size,
  )

  # Sync arguments.
  dist_args_k = ['host_name', 'host_port', 'local_rank', 'rank', 'world_size']
  for k in args.__dict__.keys():
    if k in dist_args_k:
      continue

    # Host broadcasts arguments.
    if args.rank == HOST_RANK:
      store.set(k, str(args.__dict__[k]))
    # Non-host processes receive arguments from the host.
    else:
      v = store.get(k)
      if isinstance(args.__dict__[k], str):
        args.__dict__[k] = v.decode('utf-8')
      else:
        args.__dict__[k] = type(args.__dict__[k])(v)

  # Set random seed for reproducibility.  Note that each process uses a different seed to get a different slice of
  # each batch.
  lmp.util.rand.set_seed(seed=args.seed + args.rank)

  # Get model running device.
  device = torch.device('cpu')
  if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.local_rank}')

  # Load pre-trained tokenizer.
  tknzr = lmp.util.tknzr.load(exp_name=args.tknzr_exp_name)

  # Get dataset instance and convert samples to tensors.
  if args.is_dset_in_memory:
    dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )
  else:
    dset = lmp.util.dset.SlowTensorDset(
      dset=lmp.util.dset.load(**args.__dict__),
      max_seq_len=args.max_seq_len,
      tknzr=tknzr,
    )

  # Mini-batch sampler.  Each process gets batches exclusive to itself.
  dist_sampler = torch.utils.data.distributed.DistributedSampler(
    num_replicas=args.world_size,
    rank=args.rank,
    dataset=dset,
    shuffle=True,
  )

  # Mini-batch distributed random sampler.  We set `persistent_workers = True` only when `args.n_worker > 0`.  We set
  # `pin_memory = True` to speed up the process (which only saves a few seconds).
  data_loader = torch.utils.data.DataLoader(
    batch_size=args.batch_size // args.world_size,
    dataset=dset,
    num_workers=args.n_worker,
    persistent_workers=bool(args.n_worker != 0),
    pin_memory=True,
    sampler=dist_sampler,
  )

  # Get new model instance and move model to running device.
  model = lmp.util.model.create(tknzr=tknzr, **args.__dict__)
  model = model.train()
  model = model.to(device)

  # Get new optimizer instance.
  optim = lmp.util.optim.get_optimizer(
    beta1=args.beta1,
    beta2=args.beta2,
    eps=args.eps,
    lr=args.lr,
    model=model,
    wd=args.wd,
  )

  # Get learning rate scheduler.
  schdl = lmp.util.optim.get_scheduler(
    optim=optim,
    total_step=args.n_epoch * len(data_loader),
    warmup_step=args.warmup_step,
  )

  # Create DDP model.
  ddp_model = torch.nn.parallel.DistributedDataParallel(model)

  # Get tensorboard logger instance.  Only the main process needs to log performance.
  if args.rank == HOST_RANK:
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
  else:
    writer = None

  # Logged performance targets.
  pre_avg_loss = 0.0
  avg_loss = 0.0

  # Global optimization step.
  step = 0
  for epoch in range(args.n_epoch):
    # Update random sampling order.
    dist_sampler.set_epoch(epoch)

    # Processes may receive unevenly distributed numbers of batches.  Thus one must use `ddp_model.join()` to avoid
    # deadlock.
    with ddp_model.join():
      tqdm_data_loader = tqdm(data_loader, desc=f'epoch: {epoch}, loss: {pre_avg_loss:.6f}', dynamic_ncols=True)
      for batch_tkids in tqdm_data_loader:
        # Samples are already encoded as token id tensors; move them to the same running device as the model.
        batch_tkids = batch_tkids.to(device)

        # Format batch token ids to satisfy language model training format.
        batch_cur_tkids = batch_tkids[..., :-1]
        batch_next_tkids = batch_tkids[..., 1:]

        # Calculate loss using loss function.
        loss = ddp_model(batch_cur_tkids=batch_cur_tkids, batch_next_tkids=batch_next_tkids)

        # Accumulate average loss for logging.  Use `.item()` to avoid keeping the computation graph alive.
        avg_loss += loss.item()

        # Perform backward pass / backpropagation.
        loss.backward()

        # Perform gradient clipping to avoid gradient explosion.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.max_norm)

        # Gradient descent.
        optim.step()

        # Update learning rate.
        schdl.step()

        # Clean up gradients.
        optim.zero_grad()

        # Increment global step.
        step += 1

        # Save checkpoint every `ckpt_step` steps.  We deep-copy the model and move the copy to CPU so the original
        # stays on the running device.  Only the main process needs to save checkpoints.
        if args.rank == HOST_RANK and step % args.ckpt_step == 0:
          lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

        # Log performance every `log_step` steps.
        if step % args.log_step == 0:
          avg_loss = avg_loss / args.log_step

          # Log on CLI.
          tqdm_data_loader.set_description(f'epoch: {epoch}, loss: {avg_loss:.6f}')

          # Log on tensorboard.  Only the main process needs to log performance.
          if args.rank == HOST_RANK:
            writer.add_scalar(f'train-loss/{args.dset_name}/{args.ver}', avg_loss, step)
            writer.add_scalar('lr', schdl.get_last_lr()[0], step)

          # Reset logged performance.
          pre_avg_loss = avg_loss
          avg_loss = 0.0

  # Save last checkpoint.  Only the main process needs to save checkpoints.
  if args.rank == HOST_RANK:
    lmp.util.model.save(ckpt=step, exp_name=args.exp_name, model=copy.deepcopy(model).to('cpu'))

    # Close tensorboard logger.  Only the main process owns one.
    writer.close()

  # Free memory.  This is only needed for unit tests.
  del args
  del avg_loss
  del batch_cur_tkids
  del batch_next_tkids
  del batch_tkids
  del data_loader
  del device
  del dist_args_k
  del dist_sampler
  del ddp_model
  del dset
  del loss
  del model
  del optim
  del pre_avg_loss
  del schdl
  del step
  del store
  del tknzr
  del tqdm_data_loader
  del writer
  torch.cuda.empty_cache()
  gc.collect()
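# A hedged sketch of launching one training process per GPU on a single node
# with `torch.multiprocessing.spawn`.  Flag names mirror the `args.*` fields
# consumed above; host/port values are illustrative, and the remaining
# experiment flags (dataset, model, optimizer, ...) must be appended for a
# real run.
import torch
import torch.multiprocessing as mp


def _worker(local_rank: int, world_size: int) -> None:
  main([
    '--rank', str(local_rank),
    '--local_rank', str(local_rank),
    '--world_size', str(world_size),
    '--host_name', '127.0.0.1',
    '--host_port', '30000',
    # ... remaining experiment flags go here ...
  ])


def launch() -> None:
  world_size = torch.cuda.device_count()
  mp.spawn(_worker, args=(world_size,), nprocs=world_size)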
def perplexity_eval(
    device: torch.device,
    model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
    sequence: str,
    tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
  r"""Helper function for calculating perplexity.

  Args:
    device:
      Model running device.
    model:
      Language model.
    sequence:
      Sequence for evaluation.  Must not be empty.
    tokenizer:
      Tokenizer for encoding sequence.

  Raises:
    TypeError:
      When one of the arguments is not an instance of its type annotation.
    ValueError:
      When `sequence` is empty.

  Returns:
    Perplexity of `sequence`.
  """
  # Type check.
  if not isinstance(device, torch.device):
    raise TypeError('`device` must be an instance of `torch.device`.')

  if not isinstance(model, (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)):
    raise TypeError(
      '`model` must be an instance of '
      '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
    )

  if not isinstance(sequence, str):
    raise TypeError('`sequence` must be an instance of `str`.')

  if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
    raise TypeError('`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.')

  # Value check.
  if not sequence:
    raise ValueError('`sequence` must not be empty.')

  # Evaluation mode.
  model.eval()
  model = model.to(device)

  # Encode sequence and convert into tensor.  Original sequence length: S.
  # New sequence length (with `[bos]` and `[eos]`): S + 2.
  sequence = tokenizer.encode(sequence, max_seq_len=-1)

  # `sequence[:-2]` keeps `[bos]` and all but the last real token as inputs:
  # the model predicts every real token starting from the one after `[bos]`,
  # and `[eos]` is never fed as input.
  # `x.shape = (S)`.
  x = torch.LongTensor(sequence[:-2]).to(device)

  # `y.shape = (S)`.
  y = sequence[1:-1]

  # Reshape into `(1, S)` to fit model.
  x = x.reshape(1, -1)

  # Get model vocabulary prediction with shape `(1, S, V)`.
  pred_y = model.predict(x)

  # Reshape into `(S)` for easier manipulation.
  x = x.squeeze(0)

  # Reshape into `(S, V)` for easier manipulation.
  pred_y = pred_y.squeeze(0)

  # Accumulate negative log-likelihood.
  nll = torch.zeros(1).to(device)

  # Iterate through each prediction.
  for pos, token_id in enumerate(y):
    probs = pred_y[pos, token_id]
    nll = nll - torch.log(probs)

  # Normalize by length.
  nll = nll / x.size(0)

  # Take the exponential to cancel the logarithm.
  return nll.exp().item()
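In formula form, the loop above computes the length-normalized negative log-likelihood of the target tokens $y_1, \dots, y_S$ and exponentiates it, which is the standard definition of perplexity:

$$\mathrm{PPL}(y_{1:S}) = \exp\!\left(-\frac{1}{S} \sum_{t=1}^{S} \log p_\theta\left(y_t \mid y_{<t}\right)\right)$$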