def main(path_to_data: str, cache_dir: str, texts_col: str, labels_col: str,
         n_classes: int, batch_size: int, batch_size_eval: int,
         min_lr: float, max_lr: float, n_epochs: int, cuda: int = 0):
    """Train an LSTM text classifier on a CSV dataset and cache the artifacts.

    Reads the CSV at ``path_to_data``, encodes the ``texts_col`` column,
    trains ``LSTMModel`` for ``n_epochs`` with a cyclic learning rate between
    ``min_lr`` and ``max_lr``, then writes ``state_dict.pt`` and
    ``config_model.json`` into ``cache_dir``.

    Args:
        path_to_data: Path to the input CSV file.
        cache_dir: Directory for saved weights/config (created if missing).
        texts_col: CSV column holding the raw texts.
        labels_col: CSV column holding the labels.
        n_classes: Number of target classes; > 2 selects CrossEntropyLoss,
            otherwise BCEWithLogitsLoss.
        batch_size: Training batch size.
        batch_size_eval: Evaluation batch size.
        min_lr: Lower bound of the cyclic learning rate.
        max_lr: Upper bound of the cyclic learning rate.
        n_epochs: Number of training epochs.
        cuda: 1 trains on GPU; any other value runs on CPU.
    """
    df = pd.read_csv(path_to_data)

    if os.path.isdir(cache_dir):
        logger.info('Cache dir found here {}'.format(cache_dir))
    else:
        logger.info('Creating cache dir')
        # makedirs also creates missing parents (os.mkdir would raise)
        os.makedirs(cache_dir)

    # Preprocess
    optimal_length = get_length(df, texts_col)
    X, vocab_size = encode_texts(df, texts_col,
                                 max_seq_length=optimal_length,
                                 return_vocab_size=True)
    y = get_labels(df, labels_col, n_classes)

    train_loader, test_loader = create_TorchLoaders(
        X, y, test_size=0.10,
        batch_size=batch_size, batch_size_eval=batch_size_eval)

    Model = LSTMModel(vocab_size=vocab_size, n_classes=n_classes)

    config_dict = {
        "vocab_size": vocab_size,
        "n_classes": n_classes,
        "max_length": optimal_length
    }

    if n_classes > 2:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        criterion = torch.nn.BCEWithLogitsLoss()

    optim = torch.optim.Adam(Model.parameters())

    ## Heuristic: an eighth of the expected total number of training steps
    ## per half-cycle. CyclicLR documents step_size_up/down as ints, so the
    ## float result is truncated; clamp to >= 1 so tiny datasets don't
    ## produce a zero step size.
    opt_cycle = max(
        1, int((((len(X) * (1 - 0.10)) / batch_size) * n_epochs) * 0.25 / 2))
    schedul = torch.optim.lr_scheduler.CyclicLR(optim, min_lr, max_lr,
                                                step_size_up=opt_cycle,
                                                step_size_down=opt_cycle,
                                                mode="exp_range",
                                                cycle_momentum=False,
                                                gamma=0.999)

    if cuda == 1:
        Model.cuda()
        device = "cuda"
    else:
        device = "cpu"

    metrics = {
        "training_loss": [],
        "eval_loss": [],
        "training_f1": [],
        "eval_f1": []
    }

    logger.info("Starting training for {} epochs".format(n_epochs))
    for epoch in range(n_epochs):
        Model.train()
        progress = progressbar.ProgressBar()
        for batch in progress(train_loader):
            inputs, labels = batch  # unpacking
            inputs = inputs.to(device, dtype=torch.long)
            # NOTE(review): labels are always cast to float, which matches
            # BCEWithLogitsLoss; CrossEntropyLoss (n_classes > 2) expects
            # long class indices -- confirm get_labels' output format.
            labels = labels.to(device, dtype=torch.float)

            # Clear any stale gradients before this batch's backward pass
            # (the original zeroed only after step(), which relied on grads
            # starting at zero for the very first batch).
            optim.zero_grad()

            preds = Model(inputs)
            loss = criterion(preds, labels)

            ## Metrics computation (before backward; loss is unchanged by it)
            metrics["training_loss"].append(loss.item())
            flat = flat_pred(preds.to("cpu").detach().numpy(), 0.5)
            tmp_f1 = f1_score(labels.to("cpu").detach().numpy(), flat,
                              average='macro')
            metrics["training_f1"].append(tmp_f1)

            ## Backward pass ##
            loss.backward()
            optim.step()  # Gradient descent
            schedul.step()

        logger.info(
            "Epoch {} done with: training loss: {}\n training f1: {}".format(
                epoch, loss.item(), tmp_f1))

        ## Eval
        progress = progressbar.ProgressBar()
        Model.eval()
        for batch in progress(test_loader):
            with torch.no_grad():  # no autograd graph -> cheaper eval
                inputs, labels = batch
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                preds = Model(inputs)
                eval_loss = criterion(preds, labels)

                ## Eval metrics
                metrics["eval_loss"].append(eval_loss.item())
                flat = flat_pred(preds.to("cpu").numpy(), 0.5)
                tmp_f1 = f1_score(labels.to("cpu").numpy(), flat,
                                  average='macro')
                metrics["eval_f1"].append(tmp_f1)

        logger.info(
            "Evaluation at iteration {} done: eval loss: {}\n eval f1: {}".
            format(epoch, eval_loss.item(), tmp_f1))

    ## Bring model back to CPU so the saved weights load on any machine
    Model.cpu()

    ## Get/Save param dict
    logger.info('Saving model in cache dir {}'.format(cache_dir))
    torch.save(Model.state_dict(), os.path.join(cache_dir, 'state_dict.pt'))
    with open(os.path.join(cache_dir, 'config_model.json'), 'w') as file:
        json.dump(config_dict, file)
def main(args):
    """Train the LSTM generator until the learning rate decays too far.

    Loads the train/valid splits from ``args.data``, builds an ``LSTMModel``
    with fixed encoder/decoder hyper-parameters, and runs an epoch loop of
    training + validation. The LR is shrunk by ``ReduceLROnPlateau`` on the
    validation loss; the loop stops when it drops below ``args.min_lr`` or
    ``args.max_epoch`` is reached. Per-batch training loss averages are
    appended to a local ``train_loss`` file; a checkpoint is written every
    epoch and the best (lowest dev loss) model is kept as
    ``best_gmodel.pt``.

    Args:
        args: Parsed command-line namespace (data paths, languages, gpuid,
            optimizer name, learning_rate, min_lr, lr_shrink, max_epoch,
            batching limits, distributed shard info, model_file prefix, ...).
    """
    use_cuda = (len(args.gpuid) >= 1)
    if args.gpuid:
        cuda.set_device(args.gpuid[0])

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.src_lang,
                                    args.trg_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.src_lang, args.trg_lang)
    if args.src_lang is None or args.trg_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.src_lang, args.trg_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    # Set model parameters
    args.encoder_embed_dim = 1000
    args.encoder_layers = 4
    args.encoder_dropout_out = 0
    args.decoder_embed_dim = 1000
    args.decoder_layers = 4
    args.decoder_out_embed_dim = 1000
    args.decoder_dropout_out = 0
    args.bidirectional = False

    logging_meters = OrderedDict()
    logging_meters['train_loss'] = AverageMeter()
    logging_meters['valid_loss'] = AverageMeter()
    logging_meters['bsz'] = AverageMeter()  # sentences per batch

    # Build model
    generator = LSTMModel(args, dataset.src_dict, dataset.dst_dict,
                          use_cuda=use_cuda)
    if use_cuda:
        generator.cuda()
    else:
        generator.cpu()

    # Look the optimizer class up by name instead of eval():
    # eval() would execute arbitrary text taken from the command line.
    optimizer = getattr(torch.optim, args.optimizer)(generator.parameters(),
                                                     args.learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=0, factor=args.lr_shrink)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    epoch_i = 1
    best_dev_loss = math.inf
    lr = optimizer.param_groups[0]['lr']

    # main training loop; the context manager guarantees the training-loss
    # log is flushed and closed even if an epoch raises
    with open("train_loss", "a") as loss_log:
        while lr > args.min_lr and epoch_i <= max_epoch:
            logging.info("At {0}-th epoch.".format(epoch_i))

            # re-seed per epoch so shuffling is reproducible but varies
            seed = args.seed + epoch_i
            torch.manual_seed(seed)

            max_positions_train = (
                min(args.max_source_positions,
                    generator.encoder.max_positions()),
                min(args.max_target_positions,
                    generator.decoder.max_positions()))

            # Initialize dataloader, starting at batch_offset
            itr = dataset.train_dataloader(
                'train',
                max_tokens=args.max_tokens,
                max_sentences=args.max_sentences,
                max_positions=max_positions_train,
                seed=seed,
                epoch=epoch_i,
                sample_without_replacement=args.sample_without_replacement,
                sort_by_source_size=(epoch_i <= args.curriculum),
                shard_id=args.distributed_rank,
                num_shards=args.distributed_world_size,
            )

            # set training mode
            generator.train()

            # reset meters
            for key, val in logging_meters.items():
                if val is not None:
                    val.reset()

            for i, sample in enumerate(itr):
                if use_cuda:
                    # wrap input tensors in cuda tensors
                    # NOTE(review): `cuda` here is the module object, which
                    # is always truthy; `use_cuda` was presumably intended.
                    # This branch only runs when use_cuda is True, so the
                    # effect is identical -- confirm utils.make_variable.
                    sample = utils.make_variable(sample, cuda=cuda)

                loss = generator(sample)
                sample_size = sample['target'].size(
                    0) if args.sentence_avg else sample['ntokens']
                nsentences = sample['target'].size(0)
                # loss per token (or sentence) in bits, for logging only
                logging_loss = loss.data / sample_size / math.log(2)
                logging_meters['bsz'].update(nsentences)
                logging_meters['train_loss'].update(logging_loss, sample_size)
                loss_log.write(
                    "{0}\n".format(logging_meters['train_loss'].avg))
                logging.debug(
                    "loss at batch {0}: {1:.3f}, batch size: {2}, lr={3}".
                    format(i, logging_meters['train_loss'].avg,
                           round(logging_meters['bsz'].avg),
                           optimizer.param_groups[0]['lr']))

                optimizer.zero_grad()
                loss.backward()

                # all-reduce grads and rescale by grad_denom
                for p in generator.parameters():
                    if p.requires_grad:
                        p.grad.data.div_(sample_size)

                # clip_grad_norm was renamed clip_grad_norm_ in newer torch;
                # old spelling kept for this file's torch version
                torch.nn.utils.clip_grad_norm(generator.parameters(),
                                              args.clip_norm)
                optimizer.step()

            # validation -- this is a crude estimation because there might
            # be some padding at the end
            max_positions_valid = (
                generator.encoder.max_positions(),
                generator.decoder.max_positions(),
            )

            # Initialize dataloader
            itr = dataset.eval_dataloader(
                'valid',
                max_tokens=args.max_tokens,
                max_sentences=args.max_sentences,
                max_positions=max_positions_valid,
                skip_invalid_size_inputs_valid_test=(
                    args.skip_invalid_size_inputs_valid_test),
                # largest batch first to warm the caching allocator
                descending=True,
                shard_id=args.distributed_rank,
                num_shards=args.distributed_world_size,
            )

            # set validation mode
            generator.eval()

            # reset meters
            for key, val in logging_meters.items():
                if val is not None:
                    val.reset()

            for i, sample in enumerate(itr):
                with torch.no_grad():
                    if use_cuda:
                        # wrap input tensors in cuda tensors
                        sample = utils.make_variable(sample, cuda=cuda)
                    loss = generator(sample)
                    sample_size = sample['target'].size(
                        0) if args.sentence_avg else sample['ntokens']
                    loss = loss / sample_size / math.log(2)
                    logging_meters['valid_loss'].update(loss, sample_size)
                    logging.debug("dev loss at batch {0}: {1:.3f}".format(
                        i, logging_meters['valid_loss'].avg))

            # update learning rate
            lr_scheduler.step(logging_meters['valid_loss'].avg)
            lr = optimizer.param_groups[0]['lr']

            logging.info(
                "Average loss value per instance is {0} at the end of "
                "epoch {1}".format(logging_meters['valid_loss'].avg, epoch_i))

            # Pass a path to torch.save so the file is closed for us
            # (the original leaked an open file object per checkpoint).
            torch.save(
                generator.state_dict(),
                args.model_file + "data.nll_{0:.3f}.epoch_{1}.pt".format(
                    logging_meters['valid_loss'].avg, epoch_i))

            if logging_meters['valid_loss'].avg < best_dev_loss:
                best_dev_loss = logging_meters['valid_loss'].avg
                torch.save(generator.state_dict(),
                           args.model_file + "best_gmodel.pt")

            epoch_i += 1