def test(args, model, device, test_loader, epoch):
    """Evaluate ``model`` on the test set and report loss/accuracy to ClearML.

    :param args: CLI namespace (unused here, kept for a uniform signature)
    :param model: network producing log-probabilities (``F.nll_loss`` input)
    :param device: torch device batches are moved to
    :param test_loader: DataLoader yielding (data, target) batches
    :param epoch: reporting iteration for the ClearML scalars
    """
    model.eval()
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for batch, labels in test_loader:
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)
            # accumulate the summed (not averaged) batch loss
            total_loss += F.nll_loss(logits, labels, reduction="sum").item()
            # the class with the highest log-probability is the prediction
            predictions = logits.argmax(dim=1, keepdim=True)
            n_correct += predictions.eq(labels.view_as(predictions)).sum().item()
    n_samples = len(test_loader.dataset)
    total_loss /= n_samples
    logger = Logger.current_logger()
    logger.report_scalar("test", "loss", iteration=epoch, value=total_loss)
    logger.report_scalar(
        "test", "accuracy", iteration=epoch, value=(n_correct / n_samples)
    )
    print(
        "Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
            total_loss,
            n_correct,
            n_samples,
            100.0 * n_correct / n_samples,
        )
    )
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch, logging loss every ``args.log_interval`` batches.

    :param args: namespace providing at least ``log_interval``
    :param model: network producing log-probabilities (trained in place)
    :param device: torch device batches are moved to
    :param train_loader: DataLoader yielding (data, target) batches
    :param optimizer: optimizer stepped once per batch
    :param epoch: epoch index, folded into the ClearML iteration counter
    """
    save_loss = []  # per-batch loss history (plain floats) for this epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        # BUGFIX: store the python float rather than the loss tensor.
        # Appending the live tensor kept each batch's autograd graph
        # referenced for the whole epoch, steadily growing (GPU) memory.
        save_loss.append(loss.item())
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            Logger.current_logger().report_scalar(
                "train", "loss",
                iteration=(epoch * len(train_loader) + batch_idx),
                value=loss.item())
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            # Add manual scalar reporting for loss metrics
            Logger.current_logger().report_scalar(
                title='Scalar example {} - epoch'.format(epoch),
                series='Loss', value=loss.item(), iteration=batch_idx)
def train(args, model, device, train_loader, optimizer, epoch):
    """One training pass over ``train_loader``.

    Reports the batch loss to ClearML and prints progress every
    ``args.log_interval`` batches.
    """
    model.train()
    n_batches = len(train_loader)
    n_samples = len(train_loader.dataset)
    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        log_probs = model(inputs)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
        # only log/print every ``log_interval`` batches
        if step % args.log_interval != 0:
            continue
        Logger.current_logger().report_scalar(
            "train",
            "loss",
            iteration=(epoch * n_batches + step),
            value=batch_loss.item(),
        )
        print(
            "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch,
                step * len(inputs),
                n_samples,
                100.0 * step / n_batches,
                batch_loss.item(),
            )
        )
def test(model, device, criterion, test_loader, epoch):
    """Evaluate the binary classifier for one epoch and report averaged metrics.

    :param model: network under evaluation
    :param device: torch device batches are moved to
    :param criterion: loss function applied to (output, target.float())
    :param test_loader: DataLoader yielding (data, target) batches
    :param epoch: reporting iteration for the ClearML scalars
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data.float())
            loss = criterion(output, target.float())
            acc = binary_acc(output, target.float())
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    # guard against an empty loader (the original raised NameError on ``acc``)
    n_batches = max(len(test_loader), 1)
    # BUGFIX: report per-batch averages. The original reported the raw loss
    # *sum* and the accuracy of only the LAST batch divided by the dataset
    # size — neither matches the intended metrics shown in the commented
    # print below.
    # print(f'Epoch(TEST) {epoch+0:03}: | Loss: {epoch_loss/len(test_loader):.8f} | Acc: {epoch_acc/len(test_loader):.3f}')
    Logger.current_logger().report_scalar(
        "test", "loss", iteration=epoch, value=epoch_loss / n_batches)
    Logger.current_logger().report_scalar(
        "test", "accuracy", iteration=epoch, value=epoch_acc / n_batches)
def log_trajectories(model):
    """Report the ground-truth and predicted 3-D trajectories as line plots.

    Reads both position arrays from ``model.data_saver`` and sends them to
    the ClearML 3-D scatter plot under the shared title "trajectory".
    """
    logger = Logger.current_logger()
    # (series label, data_saver key) pairs — one scatter3d report per series
    series_to_key = (
        ("truth_positions", "truth_position"),
        ("predicted_position", "predicted_position"),
    )
    for series_label, saver_key in series_to_key:
        logger.report_scatter3d(
            title="trajectory",
            series=series_label,
            iteration=1,
            scatter=model.data_saver[saver_key],
            xaxis="x",
            yaxis="y",
            zaxis="z",
            mode="lines",
        )
def test(args, model, device, test_loader, epoch):
    """Evaluate on the MNIST test set and demo several ClearML report types.

    Reports the averaged loss and accuracy as scalars, the running correct
    counts as a histogram, and a 2-row matrix (running loss sums / running
    correct counts) via the confusion-matrix reporter.
    """
    loss_history = []     # running loss sum recorded after every batch
    correct_history = []  # running correct count recorded after every batch
    model.eval()
    running_loss = 0
    n_correct = 0
    with torch.no_grad():
        for batch, labels in test_loader:
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)
            # sum up batch loss
            running_loss += F.nll_loss(logits, labels, reduction='sum').item()
            # index of the max log-probability is the predicted class
            pred = logits.argmax(dim=1, keepdim=True)
            n_correct += pred.eq(labels.view_as(pred)).sum().item()
            loss_history.append(running_loss)
            correct_history.append(n_correct)
    running_loss /= len(test_loader.dataset)
    logger = Logger.current_logger()
    logger.report_scalar("test", "loss", iteration=epoch, value=running_loss)
    logger.report_scalar("test", "accuracy", iteration=epoch,
                         value=(n_correct / len(test_loader.dataset)))
    logger.report_histogram(title='Histogram example', series='correct',
                            iteration=1, values=correct_history,
                            xaxis='Test', yaxis='Correct')
    matrix = np.array([loss_history, correct_history])
    logger.report_confusion_matrix(title='Confusion matrix example',
                                   series='Test loss / correct',
                                   iteration=1, matrix=matrix)
    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            running_loss, n_correct, len(test_loader.dataset),
            100. * n_correct / len(test_loader.dataset)))
def train(model, device, train_loader, criterion, optimizer, epoch):
    """One training epoch for the binary classifier.

    Accumulates per-epoch loss/accuracy locally and reports every batch's
    loss to ClearML.
    """
    running_loss = 0
    running_acc = 0
    model.train()
    n_batches = len(train_loader)
    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs.float())
        batch_loss = criterion(predictions, labels.float())
        batch_loss.backward()
        optimizer.step()
        batch_acc = binary_acc(predictions, labels.float())
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()
        Logger.current_logger().report_scalar(
            "train",
            "loss",
            iteration=(epoch * n_batches + step),
            value=batch_loss.item(),
        )
def main():
    """Entry point: set up ClearML, parse CLI args, build MNIST loaders and
    run train/test for ``--epochs`` epochs, optionally saving the model."""
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    import os
    os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
    # NOTE(review): these are the literal strings "args.project_name" /
    # "args.task_name" — they look like they were meant to be the parsed
    # argument values, but args are only parsed below. Left unchanged because
    # moving the parse above Task.init would alter ClearML's argparse
    # auto-logging; confirm the intended names.
    task = Task.init(project_name="args.project_name", task_name="args.task_name")
    task.set_base_docker("harbor.io/nvidia/cuda:10.1-devel-ubuntu18.04")
    task.execute_remotely(queue_name="gpu", exit_process=True)

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument(
        '--project-name', type=str, default='MNIST',
        help='ML Task Name, such as Classification of numbers in MNIST')
    parser.add_argument('--task-name', type=str, default='2 layer CNN',
                        help='Technique to test, such as 2 layer CNN')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}

    # BUGFIX: the two loaders used different dataset roots ('./data' for
    # train vs '../data' for test) and train had download=False, which fails
    # outright when the files are absent. Share one root and download on demand.
    data_root = os.path.join('.', 'data')
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=True, download=True,
                       transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=False, download=True,
                       transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)
    if args.save_model:
        torch.save(model.state_dict(),
                   os.path.join(gettempdir(), "mnist_cnn.pt"))
    # BUGFIX: the original referenced an undefined ``model_snapshots_path``
    # (NameError); the snapshot actually goes to the temp directory.
    Logger.current_logger().report_text(
        'The default output destination for model snapshots and artifacts is: {}'
        .format(gettempdir()))
def validate(val_loader, encoder, decoder, criterion, epoch):
    """
    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param epoch: epoch index, used only to build the ClearML iteration counter
    :return: BLEU-4 score

    NOTE(review): relies on module-level globals ``device``, ``alpha_c``,
    ``print_freq`` and ``word_map`` — confirm they are defined elsewhere
    in this file.
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    # print('during_validation')
    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
                imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()  # un-packed copy kept for decoding hypotheses below
            scores = pack_padded_sequence(
                scores, decode_lengths,
                batch_first=True).data  # Replace this with the below two options
            targets = pack_padded_sequence(
                targets, decode_lengths,
                batch_first=True).data  # Replace this with the below two options
            # scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
            # targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

            # Calculate loss
            loss = criterion(scores, targets)

            # # Add doubly stochastic attention regularization This is on in the OG
            loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print(
                    'Validation: [{0}/{1}]\t'
                    'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(
                        i, len(val_loader), batch_time=batch_time,
                        loss=losses, top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            allcaps = allcaps[
                sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(
                        lambda c: [
                            w for w in c
                            if w not in {word_map['<start>'], word_map['<pad>']}
                        ], img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        # NOTE(review): ``i``, ``loss`` and ``top5`` leak out of the loop
        # above, so these scalars describe only the FINAL validation batch —
        # confirm per-epoch averages weren't intended instead.
        Logger.current_logger().report_scalar(
            "val", "loss",
            iteration=(epoch * len(val_loader) + i),
            value=loss.item())
        Logger.current_logger().report_scalar(
            title='val', series='top_5_accuracy', value=top5,
            iteration=(epoch * len(val_loader) + i))
        Logger.current_logger().report_scalar(
            title='val', series='bleu4', value=bleu4,
            iteration=(epoch * len(val_loader) + i))

    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'
        .format(loss=losses, top5=top5accs, bleu=bleu4))

    return bleu4
def train(train_loader, encoder, decoder, criterion, encoder_optimizer,
          decoder_optimizer, epoch):
    """
    Performs one epoch's training.

    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number

    NOTE(review): relies on module-level globals ``device``, ``alpha_c``,
    ``grad_clip``, ``print_freq`` and ``tensorboard_writer`` — confirm they
    are defined elsewhere in this file.
    """
    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()   # data loading time
    losses = AverageMeter()      # loss (per word decoded)
    top5accs = AverageMeter()    # top5 accuracy

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
            imgs, caps, caplens)
        # print(f'Alphas is {alphas}')

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores = pack_padded_sequence(
            scores, decode_lengths,
            batch_first=True).data  # Replace this with the below two options
        targets = pack_padded_sequence(
            targets, decode_lengths,
            batch_first=True).data  # Replace this with the below two options
        # scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)
        # targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)

        # Calculate loss
        loss = criterion(scores, targets)

        # # Add doubly stochastic attention regularization this is on in the OG
        loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

        # Back prop.
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        if i % 5 == 0:
            Logger.current_logger().report_scalar(
                "train", "loss",
                iteration=(epoch * len(train_loader) + i),
                value=loss.item())
            Logger.current_logger().report_scalar(
                title='train', series='top_5_accuracy', value=top5,
                iteration=(epoch * len(train_loader) + i))
            # NOTE(review): writes the loss tensor keyed by ``epoch``, so
            # every report within an epoch lands on the same x-value —
            # confirm a per-batch x-axis wasn't intended here.
            tensorboard_writer.add_scalar('loss/epoch', loss, epoch)

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses, top5=top5accs))
# import os from clearml import Task, Logger # Connecting ClearML with the current process, # from here on everything is logged automatically task = Task.init(project_name="examples", task_name="Audio and video reporting") print('reporting audio and video samples to the debug samples section') # report video, an already uploaded video media (url) Logger.current_logger().report_media( 'video', 'big bunny', iteration=1, url= 'https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_1MB.mp4' ) # report audio, report an already uploaded audio media (url) Logger.current_logger().report_media( 'audio', 'pink panther', iteration=1, url='https://www2.cs.uic.edu/~i101/SoundFiles/PinkPanther30.wav') # report audio, report local media audio file Logger.current_logger().report_media('audio', 'tada', iteration=1,